From 5ee1cb421027899d8e65595263c66d5277324e84 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 10 Nov 2025 08:18:56 -0600 Subject: [PATCH 1/3] hipblaslt: add adaptive gemm exact tuning for tf32 --- ...lk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 21032 ++++++++-------- ...gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml | 11501 +++++++++ ...lk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 17950 ++++++------- ...gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml | 14733 +++++++++++ ...ik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 19056 +++++++------- ...gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml | 15142 +++++++++++ 6 files changed, 70464 insertions(+), 28950 deletions(-) create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index f86ee27d885..ddd3f13a5e9 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -82,6 +82,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -131,7 +132,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -318,6 +319,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -325,19 +327,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x192x32_MI32N23IjcAtXY3gS9sbPso0pabe5MBq0S1TE7vcRd-meK0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x9RnxenTyJ8nNAkQ5MP4CciTM97xuXar2sESVoWHLOSA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -367,45 +369,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB5_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -415,15 +417,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -443,22 +445,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 5 + NonTemporalA: 5 + NonTemporalB: 1 NonTemporalC: 3 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -476,13 +478,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB5_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -491,17 +493,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -526,22 +528,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -554,6 +556,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -561,20 +564,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x9RnxenTyJ8nNAkQ5MP4CciTM97xuXar2sESVoWHLOSA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xFyuxAoGvph_2vmowDsHYGcRvobf86RQ55Pjx9h9U33c= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -591,7 +594,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -603,47 +606,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -651,15 +654,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -673,28 +676,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 + NonTemporalA: 7 + NonTemporalB: 0 NonTemporalC: 3 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -712,31 +715,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -755,16 +758,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -790,6 +793,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -797,20 +801,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xyRDk7sX5vsMSJppic6TLF_4te7mEav7TatLixdQJT30= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xoHfE-v637ME_GaV2g0jV2lBKdWqGLuzENP4MZ6HR9hI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -839,34 +843,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -874,10 +878,10 @@ LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -888,14 +892,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -909,27 +913,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -948,479 +952,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xFyuxAoGvph_2vmowDsHYGcRvobf86RQ55Pjx9h9U33c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 - LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xoHfE-v637ME_GaV2g0jV2lBKdWqGLuzENP4MZ6HR9hI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 - LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -1498,6 +1030,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1547,7 +1080,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -1655,8 +1188,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -1734,6 +1267,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1783,7 +1317,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 128 LSCB: 256 @@ -1891,8 +1425,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -1970,6 +1504,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2019,7 +1554,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 256 LSCB: 128 @@ -2127,8 +1662,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2206,6 +1741,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2255,7 +1791,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 256 LSCB: 64 @@ -2363,8 +1899,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2442,6 +1978,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2449,7 +1986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32N382wOe_VdwYzQeWhtrhRLLcvxRf6wbNhOQ8fX8rqeY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI32oHDfc1w9ZJ59VVCd63Cw6zcGNYOdVz-kbsA3B5E6qt0= BufferLoad: true BufferStore: true CUCount: null @@ -2460,9 +1997,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2473,16 +2010,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -2491,47 +2028,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 64 + LSPA: 1 + LSPB: 4 + LVCA: 256 + LVCB: 64 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2540,14 +2077,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2561,28 +2098,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 1 NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalC: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 32 + NumLoadsB: 24 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2599,9 +2136,9 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -2609,7 +2146,7 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 @@ -2622,16 +2159,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2643,15 +2180,15 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 32 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -2668,8 +2205,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -2678,6 +2215,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2685,20 +2223,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32cuaU17NT-6RkoTgdi0RTVYFwHV-7hZQoppoCFEA5jZ0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x64_MI32CBCon1f_jjG7kFWvrki0k9KCkvWV_6aitgnZVLy2uLQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true + DepthU: 64 + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2727,34 +2265,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 64 LSCB: 128 - LSPA: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 102400 + LdsBytesNoAmax: 163840 LdsInitCVgprs: false - LdsNumBytes: 102400 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -2763,9 +2301,9 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -2775,14 +2313,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -2803,22 +2341,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 6 + NonTemporalB: 3 NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 5 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2835,33 +2373,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 1 - ThreadTileA: 80 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2880,28 +2418,28 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -2910,10 +2448,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2921,31 +2460,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32BPvwrwOPWLMSGu9IDc3jxT3nNImpX5CzCmoRL1pKfX0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xTk4t1GYEO8dXLFvvy6qawZiyVf4SLCuxBjKBQNdkFJo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2953,7 +2492,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2963,35 +2502,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 128 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 16 LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 18944 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 18944 + LdsNumElementsAlignedA: 2560 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 35328 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -2999,36 +2538,36 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 16 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 16 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3039,19 +2578,19 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 + NonTemporalA: 1 + NonTemporalB: 7 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 @@ -3062,8 +2601,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3071,18 +2610,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3094,9 +2633,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 48 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -3122,9 +2661,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3140,8 +2679,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3150,6 +2689,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI32oHDfc1w9ZJ59VVCd63Cw6zcGNYOdVz-kbsA3B5E6qt0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x0N1POJCZet_LVsGUYqIrwKXt1eieh3pBXSGbIuD3g3g= BufferLoad: true BufferStore: true CUCount: null @@ -3167,10 +2707,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3181,16 +2721,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -3199,47 +2739,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 1 - LSPB: 4 - LVCA: 256 - LVCB: 64 - LVPA: 1 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3247,15 +2787,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3269,27 +2809,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 0 NonTemporalC: 2 - NonTemporalD: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 32 - NumLoadsB: 24 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 32 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -3307,39 +2847,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3351,23 +2891,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3376,8 +2916,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3386,6 +2926,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,12 +2934,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32EbHBmJf1RwwjS-JHuju7-HwNAkADz1-rDSkQbaangMI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32ZMmu-bzOKlbYjIX7s6koUkT8HNmefLN2P0SO5Z-Nnko= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3417,16 +2958,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -3435,34 +2976,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 1 - LVPB: 2 + LVPB: 1 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 116736 + LdsNumBytes: 135168 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 67584 LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -3484,14 +3025,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 256 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3511,22 +3052,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3543,22 +3084,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -3567,9 +3108,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 64 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 64 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3588,7 +3129,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3596,7 +3137,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3609,11 +3150,11 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3622,6 +3163,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,7 +3171,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x64_MI32CBCon1f_jjG7kFWvrki0k9KCkvWV_6aitgnZVLy2uLQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x64_MI32x3i3wgIkgjfqD9uXc_vVgFeJgfaL8Y6k8yzx4nmpWsnuI= BufferLoad: true BufferStore: true CUCount: null @@ -3671,47 +3213,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 163840 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 163840 - LdsNumElementsAlignedA: 49152 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 81920 - LdsOffsetB: 49152 - LdsOffsetB_Blk: 131072 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 - LdsOffsetMetadata_Blk: 131072 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3719,15 +3261,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3747,22 +3289,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3771,7 +3313,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3779,33 +3321,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3824,15 +3366,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3848,16 +3390,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,12 +3408,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xTk4t1GYEO8dXLFvvy6qawZiyVf4SLCuxBjKBQNdkFJo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI3284eUjDzim0Vb_W96BcV5gyLtY2Bs0qmDakwEoN2Byco= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3885,19 +3428,19 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -3907,36 +3450,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 2048 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 16 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -3944,35 +3487,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3983,22 +3526,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4006,8 +3549,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4015,22 +3558,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4038,16 +3581,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4059,16 +3602,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4090,10 +3633,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,31 +3645,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16x0_c2bicLSoaL5ucbtdGYwt9FDH0ijAaEQ7zoll3TbDU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32WBJlp7tLm-Q3nBoJMo5z21xK-EwQTfjK_G03HnBD9dU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -4133,7 +3677,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4143,35 +3687,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA4_NTB7_NTC0_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 64 LSCB: 128 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 16 + LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 2560 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 16 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -4179,36 +3723,36 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 16 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4219,19 +3763,19 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 3 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 @@ -4242,8 +3786,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4251,22 +3795,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA4_NTB7_NTC0_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4274,9 +3818,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -4302,8 +3846,8 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -4326,10 +3870,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,12 +3882,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x30fo2zv2H_H7usihB88nJCRmxw5zj2fAW_9XjCZCib_U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32bDs7ERukWCJnKPCbYUdj9elgilVVVE9QLJjd6V3PYSM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4361,16 +3906,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -4379,36 +3924,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB5_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 - LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 128 + LSPA: 1 + LSPB: 2 + LVCA: 256 + LVCB: 128 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4419,7 +3964,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -4428,14 +3973,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4455,22 +4000,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4487,18 +4032,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB5_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -4510,16 +4055,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4531,8 +4076,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -4540,7 +4085,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4556,8 +4101,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -4566,6 +4111,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,7 +4119,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT384x96x32_MI32xMp1LLxd-AbguO2fIODet7-mR4XQwMGN5Q3-AdL8vDuY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI323WKz5cIoQJzJkl1Y9sYbIyhvMjcuf8cv5vryis9qKpc= BufferLoad: true BufferStore: true CUCount: null @@ -4603,7 +4149,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -4615,36 +4161,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 49152 - LdsNumElementsAlignedB: 12288 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 49152 - LdsOffsetB_Blk: 114688 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 - LdsOffsetMetadata_Blk: 114688 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4663,15 +4209,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 96 - MacroTileA: 384 - MacroTileB: 96 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4691,22 +4237,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 12 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4723,39 +4269,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM24_WGMXCC16_WGMXCCGn1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4767,16 +4313,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4798,10 +4344,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,20 +4356,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xOycBr-nJqkLN04wSTXD2H--ROwgtIJdefEMIAR1Kids= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xYXiQ01SsEx7M8YMRarVLDIWv1Jq14florxH1dF7qE1g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4833,15 +4380,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4851,99 +4398,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 12 + NumLoadsB: 10 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -4951,7 +4498,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4959,33 +4506,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC16_WGMXCCGn1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5003,41 +4550,42 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,7 +4593,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x128_MI16xJftuo0E_G-xvDYhaHanhHEFiHU6M0rjT4KfIwGmaRvk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xpjXnNPQ1cCsbAbLFpk8IuW-bxkhtk3OqekdPjWzUVgM= BufferLoad: true BufferStore: true CUCount: null @@ -5055,7 +4603,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5069,15 +4617,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -5087,39 +4635,39 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 768 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 108544 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 108544 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 26624 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -5127,7 +4675,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5135,15 +4683,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5163,23 +4711,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 + NonTemporalA: 5 + NonTemporalB: 6 + NonTemporalC: 1 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 12 + NumLoadsB: 10 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 5 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5195,33 +4743,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5239,23 +4787,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5274,6 +4822,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,20 +4830,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x0N1POJCZet_LVsGUYqIrwKXt1eieh3pBXSGbIuD3g3g= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xdzK9JUaqXTNoF2Ac01jkgAeGIlt9zYm7jAEtGmpTIwY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5311,7 +4860,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -5323,98 +4872,98 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 61952 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 61952 + LdsNumElementsAlignedA: 20992 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 53760 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 53760 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 160 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 160 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5431,32 +4980,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 20 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 20 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -5475,29 +5024,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -5506,10 +5055,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,12 +5067,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32ZMmu-bzOKlbYjIX7s6koUkT8HNmefLN2P0SO5Z-Nnko= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x64x32_MI16xwgOGIlbHSY-6NN6XLdmF7L-EWnBxY0nUWipOv7yd_Fk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -5537,17 +5087,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -5559,36 +5109,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35840 LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 + LdsOffsetMetadata: 45056 LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5596,11 +5146,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -5608,23 +5158,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5635,22 +5185,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 6 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5658,8 +5208,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5667,8 +5217,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5676,24 +5226,24 @@ StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5711,16 +5261,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5746,6 +5296,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,20 +5304,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x64_MI32x3i3wgIkgjfqD9uXc_vVgFeJgfaL8Y6k8yzx4nmpWsnuI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5778,7 +5329,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -5786,7 +5337,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -5795,48 +5346,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPB: 8 + LVCA: 4 + LVCB: 16 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -5844,50 +5395,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 3] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 + NonTemporalA: 2 + NonTemporalB: 5 NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5903,14 +5454,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -5918,18 +5469,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5954,9 +5505,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5982,6 +5533,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,20 +5541,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x64_MI16x1Gc6DcRCckwHfmBpQBlKs8kImvAws_A5ImLAW0mdPiFY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1GRq0J0BIf9_kOmRivWvl-M_wy7XxwpHY5TtqQ7BcIXE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6013,16 +5565,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6031,47 +5583,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1280_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 1 - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 64 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1280 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103424 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 103424 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 21504 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -6079,15 +5631,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6101,29 +5653,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 + NonTemporalA: 3 + NonTemporalB: 5 NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 10 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6131,7 +5683,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6139,33 +5691,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1280_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6183,16 +5735,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6218,6 +5770,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,7 +5778,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI3284eUjDzim0Vb_W96BcV5gyLtY2Bs0qmDakwEoN2Byco= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d8vD8HZkReziZDPDfy_jTNMHwhgwUmOg1B_nL57bLcY= BufferLoad: true BufferStore: true CUCount: null @@ -6235,10 +5788,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6249,16 +5802,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6267,99 +5820,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6367,7 +5920,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6375,39 +5928,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6419,23 +5972,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6454,6 +6007,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,7 +6015,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32WBJlp7tLm-Q3nBoJMo5z21xK-EwQTfjK_G03HnBD9dU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d0IEB8OKcqBBBF-4yx_CbFvCcHAv99Kn-9PNgMVg4Do= BufferLoad: true BufferStore: true CUCount: null @@ -6471,9 +6025,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false @@ -6485,16 +6039,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6503,99 +6057,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 16 + LSCB: 16 + LSPA: 8 LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6603,7 +6157,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6611,13 +6165,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -6626,18 +6180,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6656,22 +6210,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6690,6 +6244,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,20 +6252,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32bDs7ERukWCJnKPCbYUdj9elgilVVVE9QLJjd6V3PYSM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1wJ0_010zQ1ozDEKij-mIG9KmyLbncyS2dBVtU7Gx8vc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6727,7 +6282,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6739,99 +6294,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 1 - LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6839,7 +6394,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6847,39 +6402,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6891,23 +6446,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6926,6 +6481,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,7 +6489,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI323WKz5cIoQJzJkl1Y9sYbIyhvMjcuf8cv5vryis9qKpc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x15ubexBXumzCQ5oYriKs0bC1QIR5F5xgSZw0beSsLJOA= BufferLoad: true BufferStore: true CUCount: null @@ -6943,10 +6499,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6958,15 +6514,15 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6975,99 +6531,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 + NonTemporalA: 2 + NonTemporalB: 6 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7075,7 +6631,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7083,39 +6639,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7127,23 +6683,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7158,10 +6714,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,20 +6726,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xYXiQ01SsEx7M8YMRarVLDIWv1Jq14florxH1dF7qE1g= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1TchkKUPeyaTDsftqLBdXFb97CfgH3KVHDT_kuosD6JY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7194,7 +6751,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -7202,7 +6759,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -7211,43 +6768,43 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 LSCA: 16 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 2560 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27648 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -7259,15 +6816,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7281,28 +6838,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 12 - NumLoadsB: 10 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7319,11 +6876,11 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: 1 @@ -7334,18 +6891,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7370,34 +6927,35 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,20 +6963,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xpjXnNPQ1cCsbAbLFpk8IuW-bxkhtk3OqekdPjWzUVgM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1SPQBGNO544lf62ybBPTUgV2soicNbRRY22eNTeioc20= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7447,8 +7005,8 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 LSCA: 16 LSCB: 32 LSPA: 8 @@ -7457,33 +7015,33 @@ LVCB: 8 LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27648 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -7495,15 +7053,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7517,28 +7075,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 3 NonTemporalB: 6 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 12 - NumLoadsB: 10 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7555,8 +7113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7572,16 +7130,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7600,22 +7158,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7624,8 +7182,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -7634,6 +7192,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,20 +7200,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xdzK9JUaqXTNoF2Ac01jkgAeGIlt9zYm7jAEtGmpTIwY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x17FGnDg9mjs4H7rgGicbDCmHKT_mE2FoSY2Lgq3YFt7E= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7665,7 +7224,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -7673,7 +7232,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -7683,43 +7242,43 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 0 + LSCA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61952 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 61952 - LdsNumElementsAlignedA: 20992 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 20992 - LdsOffsetB_Blk: 53760 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 53760 - LdsPadA: 16 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -7731,15 +7290,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] - MIWaveTileA: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 64 - MacroTileA: 160 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7753,29 +7312,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 2 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7783,7 +7342,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7791,13 +7350,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -7807,16 +7366,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 20 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -7842,34 +7401,35 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,17 +7437,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x64x32_MI16xwgOGIlbHSY-6NN6XLdmF7L-EWnBxY0nUWipOv7yd_Fk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xS8BemcgDmhBdB4lISWaJd3al9jSpienby5xNB2TIiUs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -7897,12 +7457,13 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -7916,75 +7477,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 32 - LSCB: 64 + LSCB: 32 LSPA: 32 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 32 + LVCB: 8 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 2] - MIWaveTileA: 7 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 64 - MacroTileA: 224 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7995,21 +7556,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 7 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 - NumLoadsCoalescedA: 7 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -8018,8 +7579,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8027,8 +7588,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8043,53 +7604,54 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 2 - ThreadTileA: 28 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8098,14 +7660,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,7 +7677,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32MrNnY6-q5-x--_2qLKFIhdkT-VfmsesFKPD_9pFBJKE= BufferLoad: true BufferStore: true CUCount: null @@ -8123,7 +7687,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -8136,91 +7700,92 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 128 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 4 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8231,23 +7796,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8255,7 +7820,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8263,33 +7828,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8299,33 +7864,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8334,14 +7900,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,7 +7917,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1GRq0J0BIf9_kOmRivWvl-M_wy7XxwpHY5TtqQ7BcIXE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= BufferLoad: true BufferStore: true CUCount: null @@ -8359,10 +7927,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8372,67 +7940,68 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -8440,50 +8009,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8491,7 +8060,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8499,85 +8068,88 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,20 +8157,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d8vD8HZkReziZDPDfy_jTNMHwhgwUmOg1B_nL57bLcY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8608,67 +8180,68 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -8676,50 +8249,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 4 NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8727,7 +8300,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8735,32 +8308,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -8771,49 +8344,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,7 +8397,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d0IEB8OKcqBBBF-4yx_CbFvCcHAv99Kn-9PNgMVg4Do= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= BufferLoad: true BufferStore: true CUCount: null @@ -8831,7 +8407,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -8844,91 +8420,92 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 + LSCA: 256 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8939,23 +8516,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8963,7 +8540,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8971,33 +8548,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9007,7 +8584,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9015,23 +8593,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9042,14 +8620,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,20 +8637,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1wJ0_010zQ1ozDEKij-mIG9KmyLbncyS2dBVtU7Gx8vc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9080,6 +8660,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 1 @@ -9087,7 +8668,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9096,102 +8677,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + LSCA: 256 + LSCB: 128 + LSPA: 1 + LSPB: 2 + LVCA: 256 + LVCB: 128 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9199,7 +8780,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9207,85 +8788,88 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,7 +8877,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x15ubexBXumzCQ5oYriKs0bC1QIR5F5xgSZw0beSsLJOA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xngs_elNKDW6m1ocFurAVW1-bILkxN-GRvv7ATNlFbNA= BufferLoad: true BufferStore: true CUCount: null @@ -9303,7 +8887,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -9313,12 +8897,13 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -9326,81 +8911,81 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 8 - LVCA: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9411,31 +8996,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 + NonTemporalA: 3 + NonTemporalB: 2 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9443,14 +9028,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -9458,17 +9043,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -9479,12 +9064,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -9494,18 +9080,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9514,14 +9100,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,7 +9117,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1TchkKUPeyaTDsftqLBdXFb97CfgH3KVHDT_kuosD6JY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= BufferLoad: true BufferStore: true CUCount: null @@ -9539,7 +9127,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -9552,91 +9140,92 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9647,23 +9236,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9671,7 +9260,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9679,33 +9268,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9715,49 +9304,532 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 116736 + LdsInitCVgprs: false + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,7 +9837,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1SPQBGNO544lf62ybBPTUgV2soicNbRRY22eNTeioc20= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= BufferLoad: true BufferStore: true CUCount: null @@ -9775,10 +9847,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9788,9 +9860,10 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -9798,108 +9871,108 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 24 + NumLoadsB: 32 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 32 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9907,7 +9980,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9916,12 +9989,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9932,26 +10005,27 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9960,22 +10034,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9984,16 +10058,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,7 +10077,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x17FGnDg9mjs4H7rgGicbDCmHKT_mE2FoSY2Lgq3YFt7E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= BufferLoad: true BufferStore: true CUCount: null @@ -10011,10 +10087,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10024,118 +10100,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 256 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 4 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 + NonTemporalA: 5 + NonTemporalB: 2 NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 + NumLoadsB: 16 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10143,7 +10220,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10152,68 +10229,69 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 96 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 96 ThreadTileB: 2 - TransposeLDS: 0 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10222,14 +10300,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,17 +10317,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x0UDQxayBLn5JqBICPOgS8LV3qdpA-ceEcBdTCX5K32w= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10262,15 +10342,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10280,72 +10360,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 20480 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 20480 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10356,22 +10436,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10380,7 +10460,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10389,38 +10469,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10430,26 +10510,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -10469,6 +10549,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10476,7 +10557,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xS8BemcgDmhBdB4lISWaJd3al9jSpienby5xNB2TIiUs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= BufferLoad: true BufferStore: true CUCount: null @@ -10486,7 +10567,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10501,8 +10582,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -10519,39 +10600,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 256 + LSPA: 16 + LSPB: 2 + LVCA: 16 + LVCB: 128 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 123392 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -10559,7 +10640,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10567,15 +10648,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10595,22 +10676,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 10 + NumLoadsB: 16 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10628,38 +10709,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 128 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 80 + ThreadTile1: 2 + ThreadTileA: 80 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10669,32 +10750,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -10708,6 +10789,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10715,7 +10797,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xAuxWbMqQMhgDMEqwX2zYcF0BxPd1rraVdJXHayyzx80= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= BufferLoad: true BufferStore: true CUCount: null @@ -10725,10 +10807,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10740,8 +10822,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -10758,98 +10840,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC5_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 128 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 5 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 6 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10858,7 +10940,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10867,12 +10949,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC5_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -10881,24 +10963,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10908,45 +10990,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10954,7 +11037,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32MrNnY6-q5-x--_2qLKFIhdkT-VfmsesFKPD_9pFBJKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xfjyEPyKcLju337expcYgdW_PYDBxtV--GwqFuufDIHQ= BufferLoad: true BufferStore: true CUCount: null @@ -10964,7 +11047,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -10979,15 +11062,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10997,16 +11080,16 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -11029,7 +11112,7 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -11037,7 +11120,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11045,15 +11128,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11073,21 +11156,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -11106,32 +11189,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11150,23 +11233,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -11175,8 +11258,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11186,6 +11269,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11193,7 +11277,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= BufferLoad: true BufferStore: true CUCount: null @@ -11204,9 +11288,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11218,15 +11302,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11236,47 +11320,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 99840 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 99840 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11285,14 +11369,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11306,28 +11390,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalB: 4 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11345,7 +11429,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11353,13 +11437,13 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11367,16 +11451,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11386,11 +11470,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11398,7 +11482,7 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11407,7 +11491,7 @@ _DepthUB: 128 _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11419,12 +11503,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11432,20 +11517,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11457,15 +11542,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11475,34 +11560,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -11510,8 +11595,8 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -11524,13 +11609,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11545,27 +11630,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -11584,21 +11669,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11606,9 +11691,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11625,10 +11710,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11637,7 +11722,7 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11658,12 +11743,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11671,20 +11757,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x5GkUaAhALWi9UJBFESYvXtYl-6lpndJ9tLANP9gypm0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11702,7 +11788,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -11714,45 +11800,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -11762,15 +11848,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11784,28 +11870,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11814,7 +11900,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11823,31 +11909,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11867,23 +11953,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -11892,17 +11978,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11910,7 +11997,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= BufferLoad: true BufferStore: true CUCount: null @@ -11920,7 +12007,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11953,24 +12040,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 1 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 106496 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -11985,7 +12072,7 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -11993,7 +12080,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12001,15 +12088,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12029,22 +12116,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 2 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12062,32 +12149,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 + StreamKXCCMapping: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12107,22 +12194,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12142,6 +12229,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12149,17 +12237,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -12174,16 +12262,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -12192,72 +12280,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12269,21 +12357,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12301,38 +12389,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12342,36 +12430,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12381,6 +12469,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12388,7 +12477,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xngs_elNKDW6m1ocFurAVW1-bILkxN-GRvv7ATNlFbNA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= BufferLoad: true BufferStore: true CUCount: null @@ -12398,7 +12487,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -12408,7 +12497,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -12419,7 +12508,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -12431,45 +12520,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 - LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -12479,15 +12568,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12507,22 +12596,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 6 NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12530,8 +12619,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12540,31 +12629,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -12584,23 +12673,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -12620,6 +12709,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12627,20 +12717,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12658,7 +12748,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -12670,92 +12760,92 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalA: 3 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -12779,32 +12869,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12820,28 +12910,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12853,12 +12943,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12866,7 +12957,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= BufferLoad: true BufferStore: true CUCount: null @@ -12876,10 +12967,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12891,13 +12982,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12909,47 +13000,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12957,15 +13048,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12979,28 +13070,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13009,7 +13100,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13018,38 +13109,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13059,28 +13150,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -13092,12 +13183,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13105,7 +13197,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= BufferLoad: true BufferStore: true CUCount: null @@ -13115,10 +13207,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13130,16 +13222,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13148,47 +13240,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13196,15 +13288,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13218,22 +13310,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -13248,7 +13340,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13257,38 +13349,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13298,45 +13390,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13344,7 +13437,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= BufferLoad: true BufferStore: true CUCount: null @@ -13354,10 +13447,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13369,16 +13462,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13387,47 +13480,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13435,15 +13528,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13457,28 +13550,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 24 - NumLoadsB: 32 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13487,7 +13580,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13496,38 +13589,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13537,32 +13630,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -13570,12 +13663,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13583,7 +13677,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= BufferLoad: true BufferStore: true CUCount: null @@ -13593,7 +13687,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13608,13 +13702,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -13626,72 +13720,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 128 - LVPA: 4 - LVPB: 1 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13702,22 +13796,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 3 NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 16 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13735,38 +13829,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 2 - ThreadTileA: 96 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13776,26 +13870,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -13815,6 +13909,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13822,7 +13917,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= BufferLoad: true BufferStore: true CUCount: null @@ -13832,10 +13927,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13848,12 +13943,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -13865,45 +13960,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 256 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 128 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -13913,15 +14008,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13935,28 +14030,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 7 - NonTemporalB: 0 + NonTemporalB: 5 NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 16 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13965,7 +14060,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13974,38 +14069,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14015,26 +14110,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -14043,17 +14138,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14061,7 +14157,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= BufferLoad: true BufferStore: true CUCount: null @@ -14072,9 +14168,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14086,16 +14182,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14104,47 +14200,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 + LDSTrInst: 1 LSCA: 32 - LSCB: 256 - LSPA: 16 - LSPB: 2 - LVCA: 16 - LVCB: 128 - LVPA: 8 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14152,15 +14248,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14174,29 +14270,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 10 - NumLoadsB: 16 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14204,7 +14300,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14213,12 +14309,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -14227,24 +14323,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14258,15 +14354,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14279,20 +14375,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14300,7 +14397,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= BufferLoad: true BufferStore: true CUCount: null @@ -14310,7 +14407,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -14325,13 +14422,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -14343,72 +14440,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 1 - LVPB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14419,22 +14516,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14452,38 +14549,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14493,32 +14590,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -14532,6 +14629,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14539,7 +14637,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1Nl77NbNO2lSWWQtfCc4QHlGwND2wKaigFRVChGJrByI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= BufferLoad: true BufferStore: true CUCount: null @@ -14549,7 +14647,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14564,16 +14662,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14582,39 +14680,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 16 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -14631,13 +14729,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -14658,23 +14756,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 4 NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14691,7 +14789,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -14701,11 +14799,11 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 5 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -14713,9 +14811,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14735,23 +14833,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -14771,6 +14869,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14778,7 +14877,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1Ny3uix0FS1YjQCNJvjDQR_jNseIyG2aDoH3JXwKer5Q= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= BufferLoad: true BufferStore: true CUCount: null @@ -14804,7 +14903,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -14812,7 +14911,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14821,34 +14920,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -14858,11 +14957,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14874,19 +14973,19 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14898,21 +14997,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 6 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14921,7 +15020,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14930,7 +15029,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -14945,16 +15044,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14971,7 +15070,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -14981,7 +15080,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -14992,15 +15091,15 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15010,6 +15109,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15017,7 +15117,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xfjyEPyKcLju337expcYgdW_PYDBxtV--GwqFuufDIHQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= BufferLoad: true BufferStore: true CUCount: null @@ -15027,7 +15127,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15042,7 +15142,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -15050,7 +15150,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15060,39 +15160,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -15108,15 +15208,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15136,21 +15236,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 4 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -15169,13 +15269,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -15185,9 +15285,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -15210,7 +15310,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -15220,16 +15320,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -15249,6 +15349,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15256,20 +15357,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15281,15 +15382,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15299,48 +15400,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 124928 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -15348,49 +15449,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 6 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15408,38 +15509,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -15452,16 +15553,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -15477,17 +15578,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15495,7 +15597,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= BufferLoad: true BufferStore: true CUCount: null @@ -15505,7 +15607,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15526,7 +15628,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: false @@ -15538,16 +15640,16 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -15575,8 +15677,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -15587,14 +15689,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15615,15 +15717,15 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -15638,7 +15740,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15647,21 +15749,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -15669,10 +15771,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15688,11 +15790,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -15700,14 +15802,14 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -15716,8 +15818,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15727,6 +15829,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15734,7 +15837,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= BufferLoad: true BufferStore: true CUCount: null @@ -15759,7 +15862,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -15767,7 +15870,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15777,24 +15880,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -15817,7 +15920,7 @@ LoopIters: 1 LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15826,14 +15929,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15853,22 +15956,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 + NonTemporalA: 6 + NonTemporalB: 6 NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15886,21 +15989,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -15909,9 +16012,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15927,19 +16030,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -15948,7 +16051,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15966,6 +16069,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15973,7 +16077,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xsFKY_CSTrHV005ozWZtl7nzj65J_jfe3myJtWO2Gg5M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= BufferLoad: true BufferStore: true CUCount: null @@ -16016,7 +16120,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 16 LSCB: 16 @@ -16092,14 +16196,14 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 + NumElementsPerBatchStore: 14 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 @@ -16125,12 +16229,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -16139,7 +16243,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -16166,7 +16270,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -16178,7 +16282,7 @@ WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -16205,6 +16309,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16212,246 +16317,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 - LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3AVvuY4XB6qJXA1Io7C9SUPUCag0Hk7KDL3-9TPD8mCI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= BufferLoad: true BufferStore: true CUCount: null @@ -16461,7 +16327,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -16494,39 +16360,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 + LSPA: 4 + LSPB: 4 LVCA: 32 LVCB: 32 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -16534,7 +16400,7 @@ LoopIters: 1 LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16570,23 +16436,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 7 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -16602,14 +16468,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -16617,7 +16483,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -16654,16 +16520,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -16683,6 +16549,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16690,7 +16557,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= BufferLoad: true BufferStore: true CUCount: null @@ -16700,10 +16567,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16715,15 +16582,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -16733,48 +16600,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -16782,48 +16649,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 0 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -16833,7 +16700,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16841,33 +16708,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16883,28 +16750,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16916,12 +16783,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16929,7 +16797,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3v3FiLk7zthz6BecsfG_ytiGrAKMvyDv3YQNVHChxGm8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= BufferLoad: true BufferStore: true CUCount: null @@ -16939,7 +16807,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -16954,7 +16822,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -16962,7 +16830,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -16972,24 +16840,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 + LVCA: 64 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 32768 + LdsNumBytes: 28672 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 @@ -17004,7 +16872,7 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -17012,7 +16880,7 @@ LoopIters: 1 LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17020,14 +16888,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17048,22 +16916,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 6 - NonTemporalC: 6 + NonTemporalA: 5 + NonTemporalB: 5 + NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17080,14 +16948,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -17096,9 +16964,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -17132,16 +17000,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -17150,17 +17018,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17168,7 +17037,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3ztAWOuJIg4hj-4ujEuGL1U8M4eXirObR_poPSQp0OOM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= BufferLoad: true BufferStore: true CUCount: null @@ -17178,10 +17047,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17199,7 +17068,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17211,45 +17080,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 64 + LSPA: 1 + LSPB: 4 + LVCA: 256 + LVCB: 64 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -17259,15 +17128,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17281,27 +17150,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 + NonTemporalB: 6 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 32 + NumLoadsB: 24 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -17311,7 +17180,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17319,39 +17188,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17364,23 +17233,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -17389,17 +17258,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17407,7 +17277,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3Foy5GH4mHSASXOOWFvkX6kZ_MyjRbIDODNqQU7kwR5k= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= BufferLoad: true BufferStore: true CUCount: null @@ -17417,7 +17287,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -17432,15 +17302,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -17450,39 +17320,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 64 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -17498,14 +17368,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17526,22 +17396,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 + NonTemporalA: 5 + NonTemporalB: 1 NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17558,33 +17428,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 5 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17600,36 +17470,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -17639,6 +17509,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17646,7 +17517,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32eYnwK6Kei8GLtUjTz4SwAJN4sTgnevpcBqDgn7APuNc= BufferLoad: true BufferStore: true CUCount: null @@ -17656,7 +17527,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -17689,24 +17560,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -17721,15 +17592,15 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17737,15 +17608,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17765,22 +17636,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17789,7 +17660,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17797,33 +17668,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17843,22 +17714,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -17878,6 +17749,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17885,7 +17757,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= BufferLoad: true BufferStore: true CUCount: null @@ -17928,7 +17800,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 LSCA: 16 LSCB: 16 @@ -17968,7 +17840,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18004,14 +17876,14 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 2 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 + NumElementsPerBatchStore: 2 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 @@ -18036,8 +17908,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -18051,7 +17923,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -18078,7 +17950,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -18090,7 +17962,7 @@ WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -18099,7 +17971,7 @@ _DepthUB: 256 _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -18117,6 +17989,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18124,7 +17997,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3mCH6mtXuBM9CjeHzE0J8gmubeKpBlXAZcQUfI9LyIOk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= BufferLoad: true BufferStore: true CUCount: null @@ -18134,10 +18007,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18150,12 +18023,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -18167,47 +18040,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18215,15 +18088,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18237,28 +18110,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 2 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18275,8 +18148,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -18286,28 +18159,28 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 6 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18320,23 +18193,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -18345,17 +18218,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18363,20 +18237,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18389,7 +18263,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -18397,7 +18271,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -18406,98 +18280,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 32 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 5632 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 45568 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 112128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 112128 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [11, 5] + MIWaveTileA: 11 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 352 + MacroTile1: 160 + MacroTileA: 352 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 220 + NumGlobalWriteVectorsPerThread: 220 + NumLoadsA: 11 + NumLoadsB: 5 + NumLoadsCoalescedA: 11 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18514,33 +18388,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 44 + ThreadTile1: 5 + ThreadTileA: 44 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18566,22 +18440,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -18589,12 +18463,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18602,20 +18477,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18627,16 +18502,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -18645,98 +18520,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1536 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 58368 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 58368 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 12800 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 45568 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 45568 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18745,7 +18620,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18753,33 +18628,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 8 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18795,7 +18670,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -18805,22 +18680,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -18828,12 +18703,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18841,7 +18717,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= BufferLoad: true BufferStore: true CUCount: null @@ -18872,7 +18748,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -18884,7 +18760,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -18894,8 +18770,8 @@ LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 57344 LdsInitCVgprs: false @@ -18916,26 +18792,26 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 32 @@ -18946,10 +18822,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -18960,16 +18836,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 6 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -18984,7 +18860,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18992,33 +18868,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19034,19 +18910,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -19055,11 +18931,11 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -19073,6 +18949,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19080,7 +18957,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= BufferLoad: true BufferStore: true CUCount: null @@ -19090,10 +18967,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19105,15 +18982,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19123,48 +19000,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -19172,48 +19049,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -19223,7 +19100,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19231,33 +19108,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19276,23 +19153,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -19301,17 +19178,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19319,7 +19197,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= BufferLoad: true BufferStore: true CUCount: null @@ -19344,15 +19222,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19362,24 +19240,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -19394,15 +19272,15 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19410,15 +19288,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19438,22 +19316,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 6 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19462,7 +19340,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19470,32 +19348,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -19515,14 +19393,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -19540,8 +19418,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -19551,6 +19429,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19558,7 +19437,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= BufferLoad: true BufferStore: true CUCount: null @@ -19569,9 +19448,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19583,16 +19462,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -19601,47 +19480,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 256 LSPA: 4 LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19649,15 +19528,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19671,29 +19550,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19701,7 +19580,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19709,39 +19588,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19751,19 +19630,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -19772,24 +19651,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19797,7 +19677,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32UDSWT1bfvHLxaf8dJ8T0eRbOPTSnBnL-fVZx5Ms-LhM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= BufferLoad: true BufferStore: true CUCount: null @@ -19808,9 +19688,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19822,13 +19702,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -19840,47 +19720,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 256 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19889,14 +19769,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19910,28 +19790,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 7 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 4 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19948,22 +19828,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -19971,16 +19851,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19990,11 +19870,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -20002,7 +19882,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20023,12 +19903,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20036,12 +19917,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x192x32_MI32fns-FsA_JlqfawCOHWwiIpnsG-TX8DiAPNFhSAbfy1c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -20067,7 +19948,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -20079,36 +19960,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCA: 8 + LVCB: 8 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -20116,35 +19997,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -20155,23 +20036,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 + NonTemporalA: 5 + NonTemporalB: 7 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20179,7 +20060,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20187,39 +20068,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 0 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20229,19 +20110,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20250,7 +20131,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20264,10 +20145,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20275,20 +20157,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3X-wpSRHzrp-MeuOcz4MBfcmq-O8PWo_CDo9EAFFm4zI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20306,7 +20188,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -20318,34 +20200,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 8192 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -20353,64 +20235,64 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20426,33 +20308,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20468,26 +20350,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -20496,17 +20378,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20514,20 +20397,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xYjM8KzEWXOVAnYjY1guEsDCr3YdYN-J2M3Jt09-isIo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20539,16 +20422,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -20557,34 +20440,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 32 + LSCB: 64 + LSPA: 4 LSPB: 8 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -20592,64 +20475,64 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 12 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20657,7 +20540,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20665,8 +20548,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -20675,23 +20558,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20707,17 +20590,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -20728,7 +20611,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20740,12 +20623,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20753,17 +20637,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -20778,16 +20662,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -20796,45 +20680,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 12800 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -20844,15 +20728,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20873,21 +20757,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalB: 6 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20904,32 +20788,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -20946,45 +20830,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20992,40 +20877,40 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -21035,47 +20920,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 4096 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21083,15 +20968,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21105,28 +20990,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21134,7 +21019,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -21143,8 +21028,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -21154,22 +21039,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21188,23 +21073,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -21218,12 +21103,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21231,7 +21117,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= BufferLoad: true BufferStore: true CUCount: null @@ -21241,7 +21127,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -21256,15 +21142,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -21274,16 +21160,16 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -21306,7 +21192,7 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -21323,14 +21209,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21350,23 +21236,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 5 + NonTemporalA: 4 + NonTemporalB: 4 NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21382,22 +21268,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 6 SubGroup0: 2 SubGroup1: 64 SubGroupA: 2 @@ -21405,10 +21291,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21427,23 +21313,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -21461,8 +21347,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21470,7 +21357,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= BufferLoad: true BufferStore: true CUCount: null @@ -21480,7 +21367,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -21513,34 +21400,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 1 - LSCA: 32 - LSCB: 64 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 57344 + LdsNumBytes: 32768 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 16384 LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -21551,34 +21438,34 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -21589,23 +21476,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21621,13 +21510,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -21637,16 +21526,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -21661,9 +21550,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -21673,18 +21565,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21698,10 +21590,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21709,17 +21602,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -21729,7 +21622,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -21740,7 +21633,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -21752,45 +21645,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 LVCB: 4 - LVPA: 4 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124928 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 124928 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -21801,13 +21694,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [4, 3] - MIWaveTileA: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 48 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -21828,30 +21721,32 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -21860,22 +21755,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -21883,9 +21778,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 3 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 3 TransposeLDS: 2 TransposeLDSMetadata: true @@ -21900,28 +21795,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 4, 1] WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -21930,8 +21828,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21939,8 +21837,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21948,7 +21847,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x64_MI32xakdnwiFEzjPGZlWXrSw6ljoapBVWX_1-wqDBKJ3S68c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= BufferLoad: true BufferStore: true CUCount: null @@ -21958,9 +21857,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false @@ -21973,7 +21872,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -21981,7 +21880,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -21991,45 +21890,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA5_NTB7_NTC5_NTD0_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 2 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 122880 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 122880 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -22039,15 +21938,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22061,29 +21960,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 5 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22099,13 +22000,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA5_NTB7_NTC5_NTD0_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -22114,17 +22015,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 48 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -22139,6 +22040,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 @@ -22151,16 +22055,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -22169,17 +22073,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22187,7 +22092,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xhegbm5IjE5kpDKAXXbBgVDqQA9_3kFW7cQz-30MgRCE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= BufferLoad: true BufferStore: true CUCount: null @@ -22230,24 +22135,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB1_NTC7_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 131072 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -22270,7 +22175,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22279,14 +22184,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22307,22 +22212,24 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 7 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22338,22 +22245,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB1_NTC7_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -22362,9 +22269,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22378,21 +22285,24 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -22417,8 +22327,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22426,7 +22337,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xA5Q2WtheKtu6QMCmv4Znv6HCAbvakRTCv3w6SKODVK4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= BufferLoad: true BufferStore: true CUCount: null @@ -22436,27 +22347,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -22466,50 +22377,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB7_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22517,14 +22428,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22537,7 +22448,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -22545,22 +22456,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22569,7 +22481,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22577,28 +22489,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB7_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 + StreamKXCCMapping: 0 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 2 @@ -22619,7 +22532,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -22629,26 +22542,26 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22656,8 +22569,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22665,7 +22579,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= BufferLoad: true BufferStore: true CUCount: null @@ -22675,67 +22589,67 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -22745,10 +22659,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22757,14 +22671,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22776,7 +22690,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -22784,22 +22698,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22808,7 +22723,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22816,33 +22731,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22862,7 +22778,7 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -22870,22 +22786,22 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -22897,6 +22813,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22904,7 +22821,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= BufferLoad: true BufferStore: true CUCount: null @@ -22914,67 +22830,67 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 + LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -22984,10 +22900,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23015,7 +22931,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23023,22 +22939,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23047,7 +22964,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23055,28 +22972,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 2 @@ -23097,7 +23015,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -23108,23 +23026,23 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -23134,8 +23052,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23143,7 +23062,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1DkE1Pm7B7tu5CU4zsZYTE5zyVc3w0JNiRD9vw48lz-M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= BufferLoad: true BufferStore: true CUCount: null @@ -23157,63 +23076,63 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -23223,38 +23142,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23262,23 +23181,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23286,7 +23206,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23294,33 +23214,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23336,36 +23257,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23373,8 +23294,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23382,7 +23304,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3iPkC1ju8KB9TUVlQezZnhwLo83WpXzjn_z8hDH25qMs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= BufferLoad: true BufferStore: true CUCount: null @@ -23392,80 +23314,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB5_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 LSPA: 8 LSPB: 32 LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 114688 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 114688 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23473,15 +23395,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23493,30 +23415,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 5 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23525,7 +23448,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23533,37 +23456,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB5_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -23585,35 +23509,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23621,7 +23546,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t4qDciKFGFI7k9IDTRUUOln0r0cA4pct9CJVaPHEfKI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= BufferLoad: true BufferStore: true CUCount: null @@ -23631,80 +23556,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 73728 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 147456 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23712,15 +23637,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23732,30 +23657,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23764,7 +23690,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23772,33 +23698,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23814,45 +23741,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23860,7 +23788,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= BufferLoad: true BufferStore: true CUCount: null @@ -23870,28 +23798,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -23900,101 +23828,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24011,32 +23940,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -24053,45 +23983,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24099,7 +24030,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= BufferLoad: true BufferStore: true CUCount: null @@ -24109,80 +24040,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24191,14 +24122,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24210,7 +24141,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -24218,15 +24149,15 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 @@ -24234,7 +24165,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24242,7 +24174,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24250,19 +24182,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24272,11 +24204,12 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24292,34 +24225,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -24329,8 +24262,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24338,7 +24272,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= BufferLoad: true BufferStore: true CUCount: null @@ -24352,76 +24285,76 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24429,15 +24362,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24449,7 +24382,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -24457,22 +24390,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24481,7 +24415,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24489,28 +24423,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 2 @@ -24541,24 +24476,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -24566,10 +24501,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24577,7 +24513,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3W8Z5ZSNH64DOPbOnJWe8w1zi1tozbubXkwtbGedSVPo= BufferLoad: true BufferStore: true CUCount: null @@ -24587,27 +24522,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -24617,11 +24552,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB4_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 LSCB: 32 LSPA: 16 @@ -24633,21 +24568,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -24655,12 +24590,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24688,30 +24623,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24720,7 +24656,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24728,19 +24664,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB4_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24750,6 +24686,7 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 1 @@ -24758,7 +24695,7 @@ TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24770,7 +24707,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -24781,34 +24718,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24816,7 +24754,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= BufferLoad: true BufferStore: true CUCount: null @@ -24826,80 +24763,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24907,15 +24844,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24927,7 +24864,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -24935,22 +24872,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24959,7 +24897,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24967,33 +24905,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25009,45 +24948,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25055,7 +24995,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x33yLL0SJKE2m7wSkwpjfsRBbNJmY03s6tqg4A_rIjS2Q= BufferLoad: true BufferStore: true CUCount: null @@ -25065,28 +25004,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -25095,50 +25034,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB4_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -25146,14 +25085,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -25166,30 +25105,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25206,37 +25146,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB4_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25248,45 +25189,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25294,7 +25236,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= BufferLoad: true BufferStore: true CUCount: null @@ -25304,72 +25245,72 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 1 - LSPB: 4 - LVCA: 256 - LVCB: 64 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 98304 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 98304 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -25377,7 +25318,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -25385,15 +25326,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25405,7 +25346,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25413,22 +25354,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 32 - NumLoadsB: 24 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25445,39 +25387,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 32 ThreadTile1: 3 - ThreadTileA: 64 + ThreadTileA: 32 ThreadTileB: 3 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -25487,36 +25430,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25524,8 +25467,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25533,7 +25477,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32LJJdRcK9uX5siV1xSWRUHMOlzu2hp3_CNfkuDSNvJTo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= BufferLoad: true BufferStore: true CUCount: null @@ -25543,28 +25487,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -25573,102 +25517,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25684,37 +25629,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25729,42 +25675,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25772,7 +25719,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -25786,104 +25733,104 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -25891,22 +25838,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25923,32 +25871,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -25965,34 +25914,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -26002,8 +25951,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26011,7 +25961,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32eYnwK6Kei8GLtUjTz4SwAJN4sTgnevpcBqDgn7APuNc= BufferLoad: true BufferStore: true CUCount: null @@ -26021,27 +25970,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -26051,75 +26000,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -26130,21 +26079,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -26162,33 +26111,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26204,34 +26154,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -26241,8 +26191,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26250,37 +26201,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -26290,50 +26241,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26341,15 +26292,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26361,30 +26312,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26401,33 +26353,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26447,41 +26400,42 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26489,7 +26443,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1SuDoxsto8y0N6ju4XPIRoswylaI8Zz1_dr095Qvnb80= BufferLoad: true BufferStore: true CUCount: null @@ -26499,27 +26452,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -26529,27 +26482,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62464 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 62464 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 13312 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -26561,18 +26514,18 @@ LdsOffsetMetadata: 16384 LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26580,15 +26533,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26600,9 +26553,9 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -26610,20 +26563,21 @@ NonTemporal: -1 NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26640,33 +26594,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26682,45 +26637,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false - tailLoopOptB: true + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26728,7 +26684,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= BufferLoad: true BufferStore: true CUCount: null @@ -26738,28 +26694,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26768,102 +26724,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26879,39 +26836,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -26921,45 +26879,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26967,38 +26926,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3oeOmeGqaW_jaSFGz_jm6w84YNB03l0N-hq9s6Jgreqc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27007,50 +26966,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 5632 - LdsBlockSizePerPadB: 2560 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 45568 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 45568 - LdsOffsetB_Blk: 112128 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45568 - LdsOffsetMetadata_Blk: 112128 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27058,15 +27017,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [11, 5] - MIWaveTileA: 11 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 352 - MacroTile1: 160 - MacroTileA: 352 - MacroTileB: 160 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27080,28 +27039,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 220 - NumGlobalWriteVectorsPerThread: 220 - NumLoadsA: 11 - NumLoadsB: 5 - NumLoadsCoalescedA: 11 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27110,7 +27069,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27118,33 +27077,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 44 - ThreadTile1: 5 - ThreadTileA: 44 - ThreadTileB: 5 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27160,45 +27120,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27206,38 +27167,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27246,50 +27207,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 1536 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58368 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 58368 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 12800 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 45568 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 45568 - LdsPadA: 16 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27297,15 +27258,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27325,22 +27286,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27357,33 +27318,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27402,31 +27364,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -27436,8 +27398,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27445,7 +27408,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32sjq3VlWgDexdtB30zdaxZHZ2177W8NQHz9MJk1SRjvM= BufferLoad: true BufferStore: true CUCount: null @@ -27455,27 +27417,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -27485,75 +27447,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -27564,22 +27526,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27596,32 +27558,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -27638,7 +27601,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27648,24 +27611,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -27675,8 +27638,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27684,7 +27648,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32CT5g2e72DPa6AoSaCz5CFHb-qxacyGO3666s_0uERTo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= BufferLoad: true BufferStore: true CUCount: null @@ -27694,28 +27658,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27724,37 +27688,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -27764,38 +27728,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -27803,23 +27767,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27835,33 +27800,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27880,33 +27846,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27914,8 +27880,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27923,7 +27890,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= BufferLoad: true BufferStore: true CUCount: null @@ -27933,28 +27899,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27963,42 +27929,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -28006,7 +27972,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28014,15 +27980,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28042,22 +28008,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28074,33 +28040,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28116,34 +28083,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -28153,8 +28120,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28162,7 +28130,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3T2TXD7dyYbmbyNhlrtkshdt0UCqzHK0XZwxVfT5KmII= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= BufferLoad: true BufferStore: true CUCount: null @@ -28172,27 +28140,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -28202,21 +28170,21 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 65536 LdsInitCVgprs: false @@ -28237,43 +28205,43 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -28281,22 +28249,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28313,33 +28282,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28355,36 +28325,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -28392,8 +28362,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28401,7 +28372,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nB-JC3_AX_KH5Q9eyL9-Q4p4zkAhZW9ncBxvxcpPqgo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= BufferLoad: true BufferStore: true CUCount: null @@ -28411,28 +28382,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -28441,78 +28412,78 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -28521,21 +28492,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28552,33 +28524,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28597,31 +28570,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -28631,8 +28604,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28640,7 +28614,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= BufferLoad: true BufferStore: true CUCount: null @@ -28650,81 +28624,81 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 + LVCA: 4 + LVCB: 4 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -28732,26 +28706,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -28759,23 +28733,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28783,7 +28758,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28791,33 +28766,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28833,34 +28809,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -28868,10 +28844,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28879,7 +28856,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= BufferLoad: true BufferStore: true CUCount: null @@ -28890,130 +28867,131 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 37888 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29030,19 +29008,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29052,11 +29030,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29076,41 +29055,42 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29118,37 +29098,36 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x384x32_MI32RlWxLGDTWZh3JudVunUYayPOa5L9KhMoWN1WpS2AQ3o= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -29158,38 +29137,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA6_NTB0_NTC3_NTD1_NTM0_NEPBS8_NLCA5_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 128 - LSPA: 32 + LSPA: 64 LSPB: 8 - LVCA: 8 + LVCA: 4 LVCB: 32 - LVPA: 8 + LVPA: 16 LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 49152 + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -29199,10 +29178,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -29210,26 +29189,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 384 - MacroTileA: 160 - MacroTileB: 384 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -29237,22 +29216,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 5 - NumLoadsB: 12 - NumLoadsCoalescedA: 5 - NumLoadsCoalescedB: 3 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29269,33 +29249,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA6_NTB0_NTC3_NTD1_NTM0_NEPBS8_NLCA5_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 3 - ThreadTileA: 80 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29311,36 +29292,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -29350,6 +29331,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29357,7 +29339,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3OoiSIXWFvbd682u5GKn1jw4rqir0xCc9vunjY-HgA8Q= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= BufferLoad: true BufferStore: true CUCount: null @@ -29367,28 +29349,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29397,19 +29379,19 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -29437,10 +29419,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29449,14 +29431,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29476,16 +29458,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -29508,19 +29490,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29531,10 +29513,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29550,11 +29532,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29562,33 +29544,34 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29596,7 +29579,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= BufferLoad: true BufferStore: true CUCount: null @@ -29610,23 +29593,23 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -29636,39 +29619,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -29679,7 +29662,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29688,14 +29671,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29715,22 +29698,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29747,22 +29730,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -29771,15 +29754,15 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 3 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29793,7 +29776,7 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29801,22 +29784,22 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -29828,6 +29811,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29835,7 +29819,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= BufferLoad: true BufferStore: true CUCount: null @@ -29845,28 +29829,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29875,50 +29859,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29927,14 +29911,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29948,28 +29932,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29986,22 +29970,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -30009,16 +29993,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30031,8 +30015,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30040,33 +30024,34 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30074,12 +30059,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -30088,24 +30073,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -30114,39 +30099,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -30154,35 +30139,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -30193,23 +30178,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30217,7 +30202,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30225,39 +30210,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30267,34 +30252,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -30306,6 +30291,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30313,12 +30299,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -30327,23 +30313,23 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -30353,37 +30339,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8192 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 8192 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30393,35 +30379,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -30432,23 +30418,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30464,37 +30450,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -30510,32 +30496,32 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -30543,8 +30529,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30552,77 +30539,77 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30630,72 +30617,72 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30703,32 +30690,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -30755,35 +30742,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30791,141 +30779,141 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30942,19 +30930,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -30965,10 +30953,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30984,45 +30972,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31030,12 +31019,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -31044,63 +31032,63 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 64 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 4096 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 40960 LdsInitCVgprs: false LdsNumBytes: 40960 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31110,35 +31098,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -31149,22 +31137,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31172,8 +31160,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31181,19 +31169,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 130 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -31204,10 +31192,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31223,45 +31211,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31269,7 +31258,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= BufferLoad: true BufferStore: true CUCount: null @@ -31280,27 +31269,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -31309,37 +31298,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 128 - LSPA: 8 - LSPB: 4 + LSPA: 16 + LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 1 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31347,12 +31336,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31360,14 +31349,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -31382,29 +31371,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31420,32 +31409,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -31465,42 +31454,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31508,7 +31498,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= BufferLoad: true BufferStore: true CUCount: null @@ -31518,27 +31508,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -31548,80 +31538,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 128 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -31634,18 +31624,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31661,33 +31649,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31701,12 +31689,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -31716,35 +31701,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31752,7 +31738,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= BufferLoad: true BufferStore: true CUCount: null @@ -31762,27 +31747,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -31792,80 +31777,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -31874,29 +31859,27 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -31905,39 +31888,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 133 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -31945,50 +31928,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31996,12 +31977,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x112x32_MI16EvzYM6tpm_TvnED2_SgqZlUy-UXsf-lBMnZ88QQbEGY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -32010,65 +31991,65 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1792_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1792 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31232 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 31232 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 14848 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31232 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -32076,35 +32057,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 112 - MacroTileA: 128 - MacroTileB: 112 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32117,23 +32098,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 14 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 7 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32141,7 +32120,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32149,37 +32128,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 134 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1792_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 7 - ThreadTileA: 8 - ThreadTileB: 7 + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -32189,39 +32168,36 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -32231,8 +32207,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32240,7 +32217,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32FmIQnqRk09hQ4dEBba9m-GH9ijUUN9SUEMw6YgNDXO4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= BufferLoad: true BufferStore: true CUCount: null @@ -32250,28 +32227,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -32280,75 +32257,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32366,25 +32343,23 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 8 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -32393,33 +32368,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 135 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32433,41 +32408,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -32477,6 +32449,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32484,7 +32457,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= BufferLoad: true BufferStore: true CUCount: null @@ -32494,67 +32467,67 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -32565,10 +32538,10 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -32576,23 +32549,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32603,25 +32576,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32637,32 +32608,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 136 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -32677,50 +32648,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32728,7 +32697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= BufferLoad: true BufferStore: true CUCount: null @@ -32738,27 +32707,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -32768,27 +32737,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 114688 + LdsNumBytes: 46080 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 13312 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -32797,46 +32766,46 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 + LdsOffsetMetadata: 46080 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32847,25 +32816,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32881,33 +32848,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 137 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32921,12 +32888,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -32936,24 +32900,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -32963,8 +32927,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32972,7 +32937,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3h_261KD0u2Vj7WNjoAsXe6crAZkIpfL16hCc4AFHrww= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VoHL4lZ09fFgLKb3kjhbcWCkAbytz4WOa5AXsyzVOTg= BufferLoad: true BufferStore: true CUCount: null @@ -32984,26 +32949,26 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -33012,37 +32977,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 163840 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -33050,13 +33015,13 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -33064,52 +33029,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33125,33 +33088,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 138 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33165,50 +33128,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33216,7 +33177,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NfWsJl31L67XB35DT_qzFYelPH2nanMC9vO0xyiABZs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT26xQYo7HSoTsdd4K1SIwcui68F8QKoN7vKJthhD9GJjQ= BufferLoad: true BufferStore: true CUCount: null @@ -33228,25 +33189,25 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -33256,51 +33217,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 3584 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 149504 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -33308,52 +33269,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 224 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 224 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33369,33 +33328,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 139 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33409,50 +33368,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33460,7 +33417,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xagL1cGCgYkLevIGoGnEP5DIN0wWTgYBlqRG_U06ddkE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT220m3qKLTrCkhh9vPv_Zxw_hfLycTgfLtMkglSrwNBmU= BufferLoad: true BufferStore: true CUCount: null @@ -33472,26 +33429,26 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -33500,80 +33457,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 149504 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -33582,29 +33539,27 @@ NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 8 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -33613,33 +33568,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 140 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33653,329 +33608,326 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - [2, 3, 0, 1] - - - [120, 256, 1, 8192] - - [44, 0.0] + - [31, 0.0] - - [128, 256, 1, 8192] - [0, 0.0] - - [128, 2440, 1, 8192] - - [46, 0.0] + - [32, 0.0] - - [128, 5120, 1, 8192] - - [135, 158872.0] + - [122, 68.21] - - [128, 5640, 1, 8192] - - [1, 0.0] + - [123, 70.77] - - [256, 120, 1, 8192] - - [136, 35579.9] + - [91, 35579.9] - - [256, 256, 1, 8192] - - [48, 0.0] + - [34, 0.0] - - [256, 512, 1, 8192] - - [2, 0.0] + - [1, 0.0] - - [256, 528, 1, 8192] - - [49, 0.0] + - [136, 37.1] - - [256, 2048, 1, 8192] - - [3, 0.0] + - [124, 72.57] - - [512, 120, 1, 8192] - - [137, 54795.5] + - [92, 54795.5] - - [512, 128, 1, 8192] - - [4, 0.0] + - [2, 0.0] - - [512, 256, 1, 8192] - - [5, 0.0] + - [3, 0.0] - - [512, 512, 1, 8192] - - [6, 0.0] + - [4, 0.0] - - [512, 528, 1, 8192] - - [50, 0.0] + - [35, 0.0] - - [512, 1980, 1, 8192] - - [51, 0.0] + - [36, 0.0] - - [512, 2048, 1, 8192] - - [7, 0.0] + - [5, 0.0] - - [528, 256, 1, 8192] - - [52, 0.0] + - [37, 0.0] - - [1024, 512, 1, 8192] - - [53, 0.0] + - [38, 0.0] - - [1980, 512, 1, 8192] - - [54, 0.0] + - [39, 0.0] - - [2048, 512, 1, 8192] - - [8, 0.0] + - [6, 0.0] - - [2820, 5640, 1, 8192] - - [9, 0.0] + - [7, 0.0] - - [3072, 512, 1, 8192] - - [56, 0.0] + - [41, 0.0] - - [3960, 512, 1, 8192] - - [57, 0.0] + - [42, 0.0] - - [4352, 128, 1, 8192] - - [10, 0.0] + - [127, 65.33] - - [4352, 256, 1, 8192] - - [58, 0.0] + - [43, 0.0] - - [4608, 256, 1, 8192] - - [59, 0.0] + - [44, 0.0] - - [5120, 128, 1, 8192] - - [11, 0.0] + - [128, 66.51] - - [5640, 128, 1, 8192] - - [12, 0.0] + - [129, 68.8] - - [5640, 2820, 1, 8192] - - [13, 0.0] + - [8, 0.0] - - [6912, 128, 1, 8192] - - [14, 0.0] + - [130, 71.97] - - [7296, 128, 1, 8192] - - [60, 0.0] + - [45, 0.0] - - [10880, 128, 1, 8192] - - [15, 0.0] + - [9, 0.0] - - [4, 128, 8192, 30] - - [16, 0.0] + - [10, 0.0] - - [16, 128, 8192, 33] - - [17, 0.0] + - [120, 6.95] - - [40, 128, 8192, 64] - - [18, 0.0] + - [121, 17.71] - - [128, 128, 1, 17711] - - [81, 0.0] + - [58, 0.0] - - [128, 960, 1, 17711] - - [83, 0.0] + - [126, 52.82] - - [128, 2480, 1, 17711] - - [84, 0.0] + - [125, 67.94] - - [128, 17711, 1, 41] - - [86, 0.0] + - [99, 10.05] - - [384, 17711, 1, 246] - - [109, 0.0] + - [73, 0.0] - - [384, 17711, 1, 768] - - [19, 0.0] + - [97, 80.41] - - [768, 96, 1, 17711] - - [114, 0.0] + - [77, 0.0] - - [887, 256, 1, 17711] - - [115, 0.0] + - [132, 65.02] - - [928, 128, 1, 17711] - - [116, 0.0] + - [133, 50.17] - - [2732, 384, 1, 17711] - - [122, 0.0] + - [131, 93.64] - - [28, 256, 1, 4096] - - [61, 0.0] + - [119, 2.75] - - [28, 320, 1, 4096] - - [62, 0.0] + - [115, 3.3] - - [64, 512, 1, 4096] - - [69, 0.0] + - [113, 10.53] - - [72, 256, 1, 4096] - - [71, 0.0] + - [112, 6.35] - - [72, 320, 1, 4096] - - [72, 0.0] + - [112, 7.69] - - [80, 512, 1, 4096] - - [73, 0.0] + - [107, 12.45] - - [96, 512, 1, 4096] - - [76, 0.0] + - [109, 14.97] - - [116, 256, 1, 4096] - - [77, 0.0] + - [54, 0.0] - - [116, 320, 1, 4096] - - [78, 0.0] + - [55, 0.0] - - [128, 2048, 1, 4096] - - [20, 0.0] + - [100, 46.7] - - [160, 512, 1, 4096] - - [21, 0.0] + - [118, 20.26] - - [180, 256, 1, 4096] - - [89, 0.0] + - [62, 0.0] - - [180, 320, 1, 4096] - - [90, 0.0] + - [63, 0.0] - - [256, 28, 1, 4096] - - [97, 0.0] + - [111, 2.75] - - [256, 72, 1, 4096] - - [98, 0.0] + - [108, 6.24] - - [256, 116, 1, 4096] - - [99, 0.0] + - [116, 9.6] - - [256, 180, 1, 4096] - - [103, 0.0] + - [110, 13.36] - - [256, 256, 1, 4096] - - [105, 0.0] + - [114, 18.23] - - [256, 7680, 1, 4096] - - [106, 0.0] + - [71, 0.0] - - [512, 160, 1, 4096] - - [111, 0.0] + - [117, 19.51] - - [512, 512, 1, 4096] - - [22, 0.0] + - [11, 0.0] - - [512, 2246, 1, 4096] - - [112, 0.0] + - [75, 0.0] - - [1600, 128, 1, 4096] - - [118, 0.0] + - [104, 36.46] - - [1824, 2048, 1, 4096] - - [23, 0.0] + - [12, 0.0] - - [2048, 57, 1, 4096] - - [120, 0.0] + - [79, 0.0] - - [2048, 64, 1, 4096] - - [121, 0.0] + - [80, 0.0] - - [2048, 82, 1, 4096] - - [24, 0.0] + - [13, 0.0] - - [2048, 160, 1, 4096] - - [25, 0.0] + - [105, 44.82] - - [2048, 2048, 1, 4096] - - [26, 0.0] + - [14, 0.0] - - [2246, 512, 1, 4096] - - [27, 0.0] + - [15, 0.0] - - [4132, 256, 1, 4096] - - [124, 0.0] + - [81, 0.0] - - [4132, 512, 1, 4096] - - [125, 0.0] + - [82, 0.0] - - [7680, 256, 1, 4096] - - [28, 0.0] + - [16, 0.0] - - [7680, 512, 1, 4096] - - [29, 0.0] + - [17, 0.0] - - [28, 32, 8192, 28] - - [126, 0.0] + - [83, 0.0] - - [32, 25, 8192, 25] - - [127, 0.0] + - [84, 0.0] - - [32, 64, 4096, 57] - - [128, 0.0] + - [85, 0.0] - - [32, 64, 4096, 82] - - [129, 0.0] + - [86, 0.0] - - [48, 160, 4096, 192] - - [30, 0.0] + - [18, 0.0] - - [48, 160, 4096, 642] - - [31, 0.0] + - [19, 0.0] - - [64, 200, 4096, 32] - - [130, 0.0] + - [87, 0.0] - - [160, 64, 96, 4096] - - [32, 0.0] + - [20, 0.0] - - [200, 64, 4096, 32] - - [33, 0.0] + - [21, 0.0] - - [8, 256, 1, 2048] - - [34, 0.0] + - [22, 0.0] - - [16, 256, 1, 2048] - - [35, 0.0] + - [23, 0.0] - - [32, 256, 1, 2048] - - [36, 0.0] + - [24, 0.0] - - [36, 256, 1, 2048] - - [37, 0.0] + - [25, 0.0] - - [40, 256, 1, 2048] - - [38, 0.0] + - [26, 0.0] - - [48, 256, 1, 2048] - - [39, 0.0] + - [27, 0.0] - - [64, 256, 1, 2048] - - [40, 0.0] + - [28, 0.0] - - [72, 256, 1, 2048] - - [41, 0.0] + - [29, 0.0] - - [80, 256, 1, 2048] - - [42, 0.0] + - [30, 0.0] - - [96, 256, 1, 2048] - - [132, 9584.86] + - [89, 9584.86] - - [128, 256, 1, 2048] - - [82, 0.0] + - [59, 0.0] - - [256, 128, 1, 2048] - - [101, 0.0] + - [68, 0.0] - - [256, 256, 1, 2048] - - [104, 0.0] + - [70, 0.0] - - [64, 128, 1, 8192] - - [43, 0.0] + - [134, 5.16] - - [128, 128, 1, 8192] - - [45, 0.0] + - [135, 9.41] - - [256, 128, 1, 98304] - - [47, 0.0] + - [33, 0.0] - - [1980, 1024, 1, 8192] - - [55, 0.0] + - [40, 0.0] - - [57, 32, 1, 262144] - - [63, 0.0] + - [46, 0.0] - - [64, 64, 1, 102400] - - [64, 0.0] + - [47, 0.0] - - [64, 64, 1, 131072] - - [65, 0.0] + - [48, 0.0] - - [64, 64, 1, 819200] - - [66, 0.0] + - [49, 0.0] - - [64, 128, 1, 1024] - - [67, 0.0] + - [106, 1.02] - - [64, 128, 1, 131072] - - [68, 0.0] + - [50, 0.0] - - [72, 128, 1, 1024] - - [70, 0.0] + - [51, 0.0] - - [82, 32, 1, 262144] - - [74, 0.0] + - [52, 0.0] - - [96, 128, 1, 1024] - - [75, 0.0] + - [53, 0.0] - - [128, 64, 1, 131072] - - [79, 0.0] + - [56, 0.0] - - [128, 128, 1, 1024] - - [80, 0.0] + - [57, 0.0] - - [128, 4096, 1, 1024] - - [85, 0.0] + - [94, 36.1] - - [128, 7456, 1, 1024] - - [138, 97916.7] + - [93, 49.79] - - [144, 128, 1, 1024] - - [87, 0.0] + - [60, 0.0] - - [160, 10, 1, 655360] - - [88, 0.0] + - [61, 0.0] - - [192, 48, 1, 655360] - - [91, 0.0] + - [64, 0.0] - - [192, 112, 1, 655360] - - [92, 0.0] + - [137, 61.51] - - [224, 64, 1, 527553] - - [93, 0.0] + - [138, 49.28] - - [224, 64, 1, 752863] - - [94, 0.0] + - [139, 52.03] - - [233, 56, 1, 131072] - - [95, 0.0] + - [65, 0.0] - - [252, 128, 1, 17711] - - [96, 0.0] + - [66, 0.0] - - [256, 128, 1, 1024] - - [100, 0.0] + - [67, 0.0] - - [256, 128, 1, 17711] - - [102, 0.0] + - [69, 0.0] - - [256, 7968, 1, 1024] - - [107, 0.0] + - [96, 65.32] - - [288, 64, 1, 806154] - - [108, 0.0] + - [72, 0.0] - - [512, 128, 1, 1024] - - [110, 0.0] + - [74, 0.0] - - [512, 2011, 1, 1024] - - [139, 100794.0] + - [95, 52.72] - - [642, 304, 1, 655360] - - [113, 0.0] + - [76, 0.0] - - [1024, 128, 1, 2048] - - [117, 0.0] + - [78, 0.0] - - [2011, 512, 1, 1024] - - [119, 0.0] + - [103, 50.39] - - [4096, 128, 1, 1024] - - [123, 0.0] + - [102, 32.27] - - [20, 48, 17711, 124] - - [133, 30249.3] + - [90, 30249.3] - - [128, 128, 6, 17711] - - [131, 0.0] + - [88, 0.0] - - [128, 17711, 6, 128] - - [134, 90258.2] + - [98, 49.6] - - [7968, 256, 1, 1024] - - [140, 135783.0] + - [101, 64.89] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..53d68a7b194 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml @@ -0,0 +1,11501 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 0058] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 114688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 114688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 98304 + LdsInitCVgprs: false + LdsNumBytes: 98304 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3oeOmeGqaW_jaSFGz_jm6w84YNB03l0N-hq9s6Jgreqc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 13312 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VoHL4lZ09fFgLKb3kjhbcWCkAbytz4WOa5AXsyzVOTg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 163840 + LdsInitCVgprs: false + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT26xQYo7HSoTsdd4K1SIwcui68F8QKoN7vKJthhD9GJjQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 149504 + LdsInitCVgprs: false + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT220m3qKLTrCkhh9vPv_Zxw_hfLycTgfLtMkglSrwNBmU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 149504 + LdsInitCVgprs: false + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [64, 128, 1, 1024] + - [0, 1.02] + - - [128, 7456, 1, 1024] + - [1, 49.79] + - - [128, 4096, 1, 1024] + - [2, 36.1] + - - [512, 2011, 1, 1024] + - [3, 52.72] + - - [256, 7968, 1, 1024] + - [4, 65.32] + - - [384, 17711, 1, 768] + - [5, 80.41] + - - [128, 17711, 6, 128] + - [6, 49.6] + - - [128, 17711, 1, 41] + - [7, 10.05] + - - [128, 2048, 1, 4096] + - [8, 46.7] + - - [7968, 256, 1, 1024] + - [9, 64.89] + - - [4096, 128, 1, 1024] + - [10, 32.27] + - - [2011, 512, 1, 1024] + - [11, 50.39] + - - [1600, 128, 1, 4096] + - [12, 36.46] + - - [2048, 160, 1, 4096] + - [13, 44.82] + - - [80, 512, 1, 4096] + - [14, 12.45] + - - [256, 72, 1, 4096] + - [15, 6.24] + - - [96, 512, 1, 4096] + - [16, 14.97] + - - [256, 180, 1, 4096] + - [17, 13.36] + - - [256, 28, 1, 4096] + - [18, 2.75] + - - [72, 320, 1, 4096] + - [19, 7.69] + - - [64, 512, 1, 4096] + - [20, 10.53] + - - [256, 256, 1, 4096] + - [21, 18.23] + - - [28, 320, 1, 4096] + - [22, 3.3] + - - [256, 116, 1, 4096] + - [23, 9.6] + - - [512, 160, 1, 4096] + - [24, 19.51] + - - [72, 256, 1, 4096] + - [19, 6.35] + - - [160, 512, 1, 4096] + - [25, 20.26] + - - [28, 256, 1, 4096] + - [26, 2.75] + - - [16, 128, 8192, 33] + - [27, 6.95] + - - [40, 128, 8192, 64] + - [28, 17.71] + - - [128, 5120, 1, 8192] + - [29, 68.21] + - - [128, 5640, 1, 8192] + - [30, 70.77] + - - [256, 2048, 1, 8192] + - [31, 72.57] + - - [128, 2480, 1, 17711] + - [32, 67.94] + - - [128, 960, 1, 17711] + - [33, 52.82] + - - [4352, 128, 1, 8192] + - [34, 65.33] + - - [5120, 128, 1, 8192] + - [35, 66.51] + - - [5640, 128, 1, 8192] + - [36, 68.8] + - - [6912, 128, 1, 8192] + - [37, 71.97] + - - [2732, 384, 1, 17711] + - [38, 93.64] + - - [887, 256, 1, 17711] + - [39, 65.02] + - - [928, 128, 1, 17711] + - [40, 50.17] + - - [64, 128, 1, 8192] + - [41, 5.16] + - - [128, 128, 1, 8192] + - [42, 9.41] + - - [256, 528, 1, 8192] + - [43, 37.1] + - - [192, 112, 1, 655360] + - [44, 61.51] + - - [224, 64, 1, 527553] + - [45, 49.28] + - - [224, 64, 1, 752863] + - [46, 52.03] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 69c681747d5..5c585965170 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -82,6 +82,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -131,7 +132,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1 LDSTrInst: 1 LSCA: 64 LSCB: 32 @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -314,10 +315,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -325,7 +327,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x80x64_MI16xRVFK1XBidnBlbpNmd3CrBE33uuI0SAPL3Qyw7W8WneM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32i2_U0UMyp0MD6JrbAMSGAM4jMSqovGlng0TkZUPkdbU= BufferLoad: true BufferStore: true CUCount: null @@ -335,7 +337,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -349,13 +351,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -367,36 +369,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB5_NTC2_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55808 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -405,34 +407,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 80 + MacroTile1: 256 MacroTileA: 128 - MacroTileB: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -444,21 +446,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 4 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -476,38 +478,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB5_NTC2_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -519,23 +521,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -550,10 +552,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -561,7 +564,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32i2_U0UMyp0MD6JrbAMSGAM4jMSqovGlng0TkZUPkdbU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x48x32_MI16xkqGY-NF7CAM3EskAJ122WE1zJWH2MocSSQwjfR9WI0Q= BufferLoad: true BufferStore: true CUCount: null @@ -585,13 +588,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -603,36 +606,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 128 LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 24064 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 24064 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 24064 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -640,35 +643,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -679,22 +682,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 1 NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -703,7 +706,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -712,38 +715,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -755,16 +758,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -780,16 +783,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -797,20 +801,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x32_MI32x3liFjowZ1wszbsRAx60Qyy1bjIqtt9hLU0sWalgTr5k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xDKRM6tGNXg9sJhCFV85LAtvtZ9sqZSu-_qtnT-ApUoA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -821,7 +825,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -829,7 +833,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -839,98 +843,98 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61632 + LdsBytesNoAmax: 20992 LdsInitCVgprs: false - LdsNumBytes: 61632 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 12480 + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 35328 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -939,7 +943,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -948,38 +952,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -992,15 +996,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1013,7 +1017,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -1026,6 +1030,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1033,12 +1038,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x48x32_MI16xkqGY-NF7CAM3EskAJ122WE1zJWH2MocSSQwjfR9WI0Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI32xRnvoYb67eaAsgNwXE0Dn8TOG2hIUS2JkKOOoDbvvmi8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1058,12 +1063,12 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -1075,36 +1080,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24064 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 24064 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 36864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24064 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 36864 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -1112,35 +1117,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -1151,22 +1156,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1175,7 +1180,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1184,34 +1189,34 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 - TransposeLDSMetadata: true + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false @@ -1227,16 +1232,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1262,6 +1267,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1269,7 +1275,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xDKRM6tGNXg9sJhCFV85LAtvtZ9sqZSu-_qtnT-ApUoA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x16x32_MI16x1ZSjm3pNsBWpJBqQ8vtHYNovksGMavhsTPTpgiIsoQJA= BufferLoad: true BufferStore: true CUCount: null @@ -1293,15 +1299,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -1311,35 +1317,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 8 + LSPB: 16 LVCA: 16 LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 20992 + LdsBytesNoAmax: 10752 LdsInitCVgprs: false - LdsNumBytes: 20992 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 10752 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 8 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -1359,15 +1365,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1387,23 +1393,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 12 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1420,38 +1426,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1463,16 +1469,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1485,7 +1491,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -1494,10 +1500,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1505,7 +1512,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI32xRnvoYb67eaAsgNwXE0Dn8TOG2hIUS2JkKOOoDbvvmi8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32CJXo5F_EnC1qtNw819HS7RB4clh5fpaFNqA3ndpCH4M= BufferLoad: true BufferStore: true CUCount: null @@ -1535,7 +1542,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -1547,35 +1554,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 256 LSCB: 32 - LSPA: 32 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 8 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 36864 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 36864 - LdsPadA: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -1595,15 +1602,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1623,22 +1630,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1656,38 +1663,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1699,14 +1706,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -1721,7 +1728,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -1730,10 +1737,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1741,20 +1749,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x16x32_MI16x1ZSjm3pNsBWpJBqQ8vtHYNovksGMavhsTPTpgiIsoQJA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x17euTeWN6jMQ7FVMh1Ll2e-mPEhVuyHHM5E6MJDowG_k= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -1765,15 +1773,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -1783,34 +1791,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCA: 32 + LVCB: 16 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 10752 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 10752 + LdsNumBytes: 57856 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 + LdsOffsetA_Blk: 32768 LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10752 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -1818,12 +1826,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -1831,15 +1839,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1853,29 +1861,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1883,7 +1891,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1892,32 +1900,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1935,23 +1943,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -1960,8 +1968,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -1970,6 +1978,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1977,7 +1986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32CJXo5F_EnC1qtNw819HS7RB4clh5fpaFNqA3ndpCH4M= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x19rPAmQ_VqFXH145RBghSm4KBn8pkGE6YhVYK32za2qc= BufferLoad: true BufferStore: true CUCount: null @@ -1987,10 +1996,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2001,15 +2010,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2019,48 +2028,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -2068,49 +2077,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2128,38 +2137,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2171,29 +2180,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2206,6 +2215,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2213,7 +2223,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xXABdh3bk2Jkiez4K4X68a3AY03PVVtvBtc_eW-i0fnQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10ZIxmIfFOLgg0guqZLCypFSHKQGFlah6vEK6DSCcuO0= BufferLoad: true BufferStore: true CUCount: null @@ -2223,7 +2233,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2237,7 +2247,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2245,7 +2255,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2255,47 +2265,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC2_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 43008 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2303,15 +2313,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2331,22 +2341,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2355,7 +2365,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2364,13 +2374,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC2_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -2378,17 +2388,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -2414,16 +2424,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2432,8 +2442,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -2442,6 +2452,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2449,20 +2460,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x64_MI16x1jE4wPYzIuURSj0o-J_xbjWAp2HQMeWqPCzVThoOuG2Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2473,15 +2484,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2491,45 +2502,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28800 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 28800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2539,15 +2550,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2561,29 +2572,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 3 NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 4 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2591,7 +2602,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2600,38 +2611,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2643,29 +2654,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2674,10 +2685,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2685,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1bUjbUEv62p-Fld3AMB-QrB1lplQAKd-eS8s8Ga4rEis= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= BufferLoad: true BufferStore: true CUCount: null @@ -2709,7 +2721,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2717,7 +2729,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2727,35 +2739,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB4_NTC6_NTD6_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPA: 1 + LSPB: 32 + LVCA: 256 LVCB: 8 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -2767,7 +2779,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2775,15 +2787,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2803,23 +2815,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 + NonTemporalA: 2 + NonTemporalB: 1 NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2836,38 +2848,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB4_NTC6_NTD6_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2880,15 +2892,15 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -2901,7 +2913,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2914,6 +2926,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2921,20 +2934,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xpJUyFtapYfNNS6S-_mCC2LXpy2KwwUrl2zn0psf2XY4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT144x256x32_MI16zGLdUFppgmxwRtFeiiI5CTCUno1nocQnuy352ZtGY3g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2945,15 +2958,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2963,45 +2976,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 32 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 123392 LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3011,15 +3024,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [9, 4] + MIWaveTileA: 9 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 144 + MacroTile1: 256 + MacroTileA: 144 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3033,28 +3046,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 18 + NumLoadsB: 32 + NumLoadsCoalescedA: 9 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 32 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3063,7 +3076,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3072,13 +3085,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -3086,24 +3099,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 36 + ThreadTile1: 4 + ThreadTileA: 36 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3116,22 +3129,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3140,8 +3153,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3150,6 +3163,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,7 +3171,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1vVMzBO4mWqfB_Kye-esgwFbKcDpne8Tb620dLprWVBs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= BufferLoad: true BufferStore: true CUCount: null @@ -3181,7 +3195,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -3189,7 +3203,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -3199,34 +3213,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -3247,15 +3261,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [2, 2] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3275,9 +3289,9 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 @@ -3285,12 +3299,12 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3308,7 +3322,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3323,10 +3337,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -3358,9 +3372,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3382,10 +3396,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,12 +3408,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32x--WTD2tEyvvP6yC7EC_hWmqA4rOL1W_DJqydY3VGIMM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3413,19 +3428,19 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -3435,35 +3450,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB5_NTC1_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 45056 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 45056 - LdsPadA: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -3484,14 +3499,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3511,22 +3526,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 12 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3534,7 +3549,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -3544,17 +3559,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB5_NTC1_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3566,16 +3581,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 1 - ThreadTileA: 48 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3587,8 +3602,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3596,7 +3611,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3609,11 +3624,11 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3622,6 +3637,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,20 +3645,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x17euTeWN6jMQ7FVMh1Ll2e-mPEhVuyHHM5E6MJDowG_k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3659,7 +3675,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3671,45 +3687,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3720,14 +3736,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3741,28 +3757,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3771,7 +3787,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3780,21 +3796,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -3802,16 +3818,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3823,41 +3839,42 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,20 +3882,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x19rPAmQ_VqFXH145RBghSm4KBn8pkGE6YhVYK32za2qc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3888,6 +3905,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -3904,46 +3922,46 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 60416 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -3955,15 +3973,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3977,22 +3995,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 1 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -4016,8 +4034,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -4030,18 +4048,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 + StreamKXCCMapping: 4 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4051,49 +4069,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,7 +4122,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI16xcL6rbzCmMfe4R9bfmmNjYN2-TCjnsrf0xnm-fnL21cg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xMybmPilfmjdxVF3hhgbtCdONGixNfiKnsBpv98ldbF8= BufferLoad: true BufferStore: true CUCount: null @@ -4124,6 +4145,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -4131,7 +4153,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4140,48 +4162,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 + LSCA: 16 LSCB: 128 - LSPA: 4 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 16 LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4191,15 +4213,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4219,22 +4241,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4243,7 +4265,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4252,32 +4274,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4287,7 +4309,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -4295,16 +4318,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4322,14 +4345,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,20 +4362,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10ZIxmIfFOLgg0guqZLCypFSHKQGFlah6vEK6DSCcuO0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1zEU2zKTgrAqpo4O1jNbgKO5gG6NuTgAwAILYGhhS0vw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4360,6 +4385,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -4367,7 +4393,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -4376,46 +4402,46 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 + LSCA: 64 LSCB: 64 - LSPA: 64 + LSPA: 16 LSPB: 16 - LVCA: 4 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31744 + LdsBytesNoAmax: 98816 LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 98816 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31744 - LdsOffsetMetadata_Blk: 46080 - LdsPadA: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -4427,14 +4453,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 48 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -4449,27 +4475,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -4488,7 +4514,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4498,22 +4524,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4523,24 +4549,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4549,7 +4576,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -4558,14 +4585,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,12 +4602,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32uyg_KJ6W0X4wFR39DerojqVuONJUcgopuasEX7ENoHM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4596,14 +4625,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -4612,39 +4642,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4652,35 +4682,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4691,22 +4721,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4715,7 +4745,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4724,7 +4754,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4733,8 +4763,8 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -4746,37 +4776,38 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4785,7 +4816,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -4794,14 +4825,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,12 +4842,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI321Jlp9ISKkogci-xniY90jo1KyQxQOK1ONDpQkef74Xc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4829,17 +4862,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -4848,39 +4882,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4888,11 +4922,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4900,23 +4934,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4929,20 +4963,20 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4950,8 +4984,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4960,59 +4994,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5021,7 +5056,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5030,14 +5065,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,7 +5082,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x32_MI32aSmQIWyuX0F6x-_O4_1rqWaHuuYojs7JtkOyu6uFmvo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAu1FYXTFu65OW_QnFxWNJ9o3fDj1dpn7VH7-NENuD30= BufferLoad: true BufferStore: true CUCount: null @@ -5055,21 +5092,22 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -5077,108 +5115,108 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 39424 + LdsBytesNoAmax: 49408 LdsInitCVgprs: false - LdsNumBytes: 39424 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 49408 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 39424 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5186,8 +5224,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5196,12 +5234,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -5211,17 +5249,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5231,7 +5269,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -5246,16 +5285,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5266,14 +5305,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,12 +5322,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xjO4OfkKCkFkgvj91kWXTcGRxZ3AGGPMy8KK_MmH8an4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -5304,6 +5345,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -5311,7 +5353,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -5320,10 +5362,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -5333,26 +5375,26 @@ LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25600 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 25600 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5360,35 +5402,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5400,21 +5442,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalB: 3 + NonTemporalC: 7 NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5423,7 +5465,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5432,17 +5474,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -5455,19 +5497,20 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -5475,16 +5518,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5502,14 +5545,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,7 +5562,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-Fpr6LObdZOar2VTNmoL-twRh0ajwoRc5WwTKhLMmRI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= BufferLoad: true BufferStore: true CUCount: null @@ -5527,7 +5572,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -5540,8 +5585,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -5549,44 +5595,44 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -5596,10 +5642,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5607,15 +5653,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5635,22 +5681,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5659,7 +5705,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5668,7 +5714,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5676,24 +5722,24 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5703,49 +5749,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,20 +5802,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Nv5gocJwu5_NWJBdSETiqKu7PzRDhU09DdXveu1VomQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5776,53 +5825,54 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -5830,10 +5880,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -5844,14 +5894,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5865,28 +5915,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 0 + NonTemporalB: 1 NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5904,7 +5954,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5912,13 +5962,13 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -5926,10 +5976,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5939,16 +5989,17 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5956,16 +6007,16 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5974,14 +6025,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,20 +6042,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x34cnnDEidhRk_WmWhuWyOvEKluncEq7Gtf72_AeBCIx4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6012,14 +6065,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -6028,46 +6082,46 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 2 LSPB: 32 - LVCA: 16 + LVCA: 128 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -6080,14 +6134,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6101,28 +6155,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6140,17 +6194,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -6162,29 +6216,30 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -6192,7 +6247,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6201,23 +6256,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,20 +6282,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3tlqjbYGbF8D3dhyvwAIN6W4godSGJACkw3uFX8nzkHg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6248,6 +6305,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -6255,7 +6313,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6264,46 +6322,46 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 4 + LSPA: 1 LSPB: 32 - LVCA: 64 + LVCA: 256 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -6316,14 +6374,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6337,28 +6395,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6376,21 +6434,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -6398,20 +6456,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -6419,16 +6478,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6446,14 +6505,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,20 +6522,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3RKQpWWg-CKz1NtcjvnA_BMu7gJCV-SEO9UGnUQ_IcX4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6484,6 +6545,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -6491,7 +6553,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6500,48 +6562,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 98560 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -6552,14 +6614,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6573,27 +6635,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -6612,21 +6674,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -6634,44 +6696,45 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 32 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6682,14 +6745,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,7 +6762,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32xCgBo5HHzliWlZq-opMQMpMxU6BF3PqWo19md9_NsQAs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= BufferLoad: true BufferStore: true CUCount: null @@ -6707,7 +6772,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -6720,6 +6785,7 @@ ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -6736,39 +6802,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 69120 LdsInitCVgprs: false - LdsNumBytes: 32256 + LdsNumBytes: 69120 LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 131072 LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetB_Blk: 144896 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 69120 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6777,10 +6843,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6788,23 +6854,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] + MIWaveTile: [3, 3] MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6815,22 +6881,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 NumLoadsA: 3 - NumLoadsB: 4 + NumLoadsB: 12 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6848,13 +6914,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -6862,18 +6928,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 1 - ThreadTileA: 48 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6883,7 +6949,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -6898,34 +6965,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,7 +7002,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xxizCevnuKDk9O05gW7EGa90Gqudceas3oEQNoKTWBLM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= BufferLoad: true BufferStore: true CUCount: null @@ -6943,7 +7012,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -6956,55 +7025,56 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 2 + LVCA: 4 + LVCB: 32 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58368 + LdsBytesNoAmax: 13824 LdsInitCVgprs: false - LdsNumBytes: 58368 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 13824 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 13824 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -7013,34 +7083,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7051,23 +7121,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 1 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7084,8 +7154,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -7093,22 +7163,22 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -7119,12 +7189,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -7134,18 +7205,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7154,14 +7225,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,7 +7242,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xC6C0cUysf9CdYeZyDKzbyav_VZxNQyzLFs--R93kbs0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= BufferLoad: true BufferStore: true CUCount: null @@ -7192,8 +7265,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -7201,44 +7275,44 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 35840 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -7251,7 +7325,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7259,15 +7333,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7288,21 +7362,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalB: 0 NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7320,7 +7394,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7335,17 +7409,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7355,24 +7429,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7381,7 +7456,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7390,14 +7465,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,20 +7482,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xXKBoXMSxWcqHlgfib00ccN-Wi64fvb6VNinYeZxgXnk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7428,6 +7505,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7435,7 +7513,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -7444,100 +7522,100 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 4 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -7547,7 +7625,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7556,25 +7634,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -7582,42 +7660,43 @@ ThreadTile1: 2 ThreadTileA: 16 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7626,14 +7705,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,7 +7722,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI326WtBLaA2aXraePY7SVyMDed3tQL9zRIEHPEESGlauVA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= BufferLoad: true BufferStore: true CUCount: null @@ -7651,7 +7732,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7664,6 +7745,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7680,75 +7762,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 98816 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 98816 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7760,21 +7842,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7792,32 +7874,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7827,49 +7909,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,20 +7962,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7900,14 +7985,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7916,100 +8002,100 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 4 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -8028,7 +8114,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -8036,34 +8122,35 @@ StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8071,41 +8158,43 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,12 +8202,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI324F5sj-erxwaJrIU1muQSzHuxqiGNLO18cI0KvoEyRVM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8136,6 +8225,7 @@ ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8143,7 +8233,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -8152,10 +8242,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 128 LSCB: 32 @@ -8165,26 +8255,26 @@ LVCB: 8 LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8192,20 +8282,20 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 @@ -8217,10 +8307,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8232,15 +8322,15 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 1 + NonTemporalB: 3 NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 10 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -8255,7 +8345,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8264,17 +8354,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8290,31 +8380,32 @@ ThreadTile1: 4 ThreadTileA: 16 ThreadTileB: 4 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 @@ -8325,7 +8416,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8334,14 +8425,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,12 +8442,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI327AlSQH3iAJryoRlI_pnDbEaZckx7rs5nEeQh0g5BFdk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8372,55 +8465,56 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8428,35 +8522,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8467,22 +8561,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 6 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8491,7 +8585,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8500,17 +8594,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8522,37 +8616,38 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8561,23 +8656,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,7 +8682,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= BufferLoad: true BufferStore: true CUCount: null @@ -8608,54 +8705,55 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 1 + LSPA: 16 LSPB: 32 - LVCA: 256 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 3072 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 25088 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25088 + LdsOffsetB_Blk: 90624 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 90624 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -8675,15 +8773,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8703,22 +8801,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 1 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8727,7 +8825,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8736,42 +8834,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8779,16 +8878,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8801,19 +8900,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,7 +8922,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT144x256x32_MI16zGLdUFppgmxwRtFeiiI5CTCUno1nocQnuy352ZtGY3g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x192x32_MI16idrImgWgCO7nvcJQBWTDXiMZm2WoqSF8tVNvZiYJ_9I= BufferLoad: true BufferStore: true CUCount: null @@ -8844,54 +8945,55 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -8911,15 +9013,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [9, 4] - MIWaveTileA: 9 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 144 - MacroTile1: 256 - MacroTileA: 144 - MacroTileB: 256 + MacroTile0: 192 + MacroTile1: 192 + MacroTileA: 192 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8939,22 +9041,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 4 NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 18 - NumLoadsB: 32 - NumLoadsCoalescedA: 9 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8972,7 +9074,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8982,49 +9084,50 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 36 - ThreadTile1: 4 - ThreadTileA: 36 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9033,23 +9136,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,20 +9162,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32tdCNzBgk3lUDksKVviFzMyoUyHh69f9kY3FwffgwOyg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9080,8 +9185,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -9089,57 +9195,57 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9147,15 +9253,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9169,28 +9275,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9208,13 +9314,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -9223,27 +9329,28 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9252,22 +9359,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9278,14 +9385,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,20 +9402,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xVzdnSYsGhQaZbelX-CANSXeu3v1q-VFJ5TDW5AP_IiM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9316,118 +9425,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 256 LSCB: 32 - LSPA: 8 + LSPA: 1 LSPB: 32 - LVCA: 32 + LVCA: 256 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53312 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 53312 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4160 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9435,7 +9545,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9444,17 +9554,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -9466,37 +9576,38 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9512,16 +9623,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,7 +9642,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= BufferLoad: true BufferStore: true CUCount: null @@ -9539,7 +9652,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -9552,8 +9665,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -9561,34 +9675,34 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 49280 LdsInitCVgprs: false - LdsNumBytes: 57856 + LdsNumBytes: 49280 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -9600,7 +9714,7 @@ LdsOffsetMetadata: 8192 LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9609,10 +9723,10 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9620,23 +9734,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9647,22 +9761,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9680,13 +9794,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -9695,17 +9809,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9715,33 +9829,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9750,14 +9865,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,7 +9882,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= BufferLoad: true BufferStore: true CUCount: null @@ -9788,14 +9905,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -9804,39 +9922,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 32 LSPB: 32 - LVCA: 64 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9844,35 +9962,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9883,22 +10001,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 3 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9907,7 +10025,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9916,42 +10034,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9959,8 +10078,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -9981,19 +10100,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,7 +10122,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= BufferLoad: true BufferStore: true CUCount: null @@ -10021,57 +10142,58 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 128 LSCB: 32 - LSPA: 1 + LSPA: 8 LSPB: 32 - LVCA: 256 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -10091,15 +10213,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10119,22 +10241,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10142,7 +10264,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -10152,59 +10274,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 8 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10213,23 +10336,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,7 +10362,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= BufferLoad: true BufferStore: true CUCount: null @@ -10247,7 +10372,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10262,7 +10387,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -10270,7 +10395,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10280,45 +10405,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 2560 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60416 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 60416 - LdsNumElementsAlignedA: 10240 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 20992 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 86528 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 43008 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 86528 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10328,15 +10453,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10357,20 +10482,20 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 6 + NonTemporalB: 1 NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -10380,7 +10505,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10389,8 +10514,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -10398,23 +10523,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10430,45 +10555,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10476,32 +10602,32 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xMybmPilfmjdxVF3hhgbtCdONGixNfiKnsBpv98ldbF8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -10509,7 +10635,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10519,43 +10645,43 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -10567,14 +10693,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -10589,36 +10715,36 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 6 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -10628,8 +10754,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -10637,15 +10763,15 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -10669,7 +10795,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10679,18 +10805,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10702,12 +10828,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10715,20 +10842,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xX5RiOrSD2VklAE5bBWSRJYAzYfFgytTNDEzPIilTQVc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10740,15 +10867,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10758,43 +10885,43 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC4_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 128 - LSPA: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCA: 8 + LVCB: 8 + LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 8704 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -10807,14 +10934,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10828,29 +10955,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 + NonTemporalA: 7 + NonTemporalB: 2 NonTemporalC: 4 - NonTemporalD: 1 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10858,7 +10985,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10867,17 +10994,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC4_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -10889,10 +11016,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10908,28 +11035,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10941,12 +11068,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10954,20 +11082,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1zEU2zKTgrAqpo4O1jNbgKO5gG6NuTgAwAILYGhhS0vw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10979,7 +11107,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -10987,7 +11115,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10997,47 +11125,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 14848 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 14848 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 10240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11045,14 +11173,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11067,28 +11195,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11106,13 +11234,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -11120,24 +11248,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11147,28 +11275,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11180,12 +11308,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11193,7 +11322,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x128_MI164h6utKLGsGw5LYpCWTXGz2yJ1TVk4mfoOsLcmGvytb0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= BufferLoad: true BufferStore: true CUCount: null @@ -11203,7 +11332,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -11218,7 +11347,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -11226,7 +11355,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11236,34 +11365,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 19456 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 65536 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 19456 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 65536 - LdsOffsetB_Blk: 196608 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 36864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 98816 - LdsOffsetMetadata_Blk: 196608 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 36864 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -11273,10 +11402,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 - MFMA_BF16_1K: false - MIArchVgpr: 0 + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11284,15 +11413,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11312,23 +11441,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 3 NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -11336,7 +11465,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11345,12 +11474,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -11359,18 +11488,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11386,36 +11515,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11425,6 +11554,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11432,20 +11562,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11455,17 +11585,17 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11475,35 +11605,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 64768 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 64768 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 + LdsOffsetMetadata: 6656 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -11511,7 +11641,7 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -11524,14 +11654,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11551,22 +11681,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 + NonTemporalA: 0 + NonTemporalB: 5 NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11584,7 +11714,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11593,8 +11723,8 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11606,16 +11736,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11628,8 +11758,8 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11646,24 +11776,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11671,7 +11802,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= BufferLoad: true BufferStore: true CUCount: null @@ -11691,20 +11822,20 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11714,34 +11845,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 62976 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 73216 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11754,7 +11885,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11762,15 +11893,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 48 + MacroTile1: 384 + MacroTileA: 48 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11790,22 +11921,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11813,7 +11944,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -11823,32 +11954,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11867,16 +11998,16 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11885,7 +12016,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11894,15 +12025,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11910,7 +12042,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAu1FYXTFu65OW_QnFxWNJ9o3fDj1dpn7VH7-NENuD30= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= BufferLoad: true BufferStore: true CUCount: null @@ -11920,7 +12052,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11935,7 +12067,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -11943,7 +12075,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11953,72 +12085,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 98560 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 98560 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12029,23 +12161,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12053,7 +12187,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12062,12 +12196,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -12076,17 +12210,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12101,9 +12235,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -12113,16 +12250,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12131,17 +12268,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12149,7 +12287,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32vHcR0eKpkE7e7fyejfiaedHWaJa-n0ED8ZaCTBwI9lg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= BufferLoad: true BufferStore: true CUCount: null @@ -12159,17 +12297,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -12180,7 +12318,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12192,34 +12330,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 - LdsOffsetBias: 0 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12227,10 +12365,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -12240,14 +12378,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12262,36 +12400,38 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -12301,32 +12441,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12340,28 +12480,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12375,12 +12518,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12388,7 +12532,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= BufferLoad: true BufferStore: true CUCount: null @@ -12398,7 +12542,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -12431,36 +12575,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 68096 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 33280 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12468,11 +12612,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -12480,23 +12624,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] + MIWaveTile: [4, 1] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12507,23 +12651,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 4 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12531,7 +12677,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12540,7 +12686,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12549,23 +12695,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12579,28 +12725,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12609,17 +12758,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12627,20 +12777,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32pZ9lLvhCdmH7RpWQsDwDXdHIV2y4SsFgkBGs2DezxyQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12670,7 +12820,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -12680,48 +12830,48 @@ LVCB: 16 LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 133120 + LdsNumBytes: 66560 LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 + LdsOffsetA_Blk: 131072 LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 128 @@ -12732,28 +12882,28 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 @@ -12763,6 +12913,8 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12779,7 +12931,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12787,24 +12939,24 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12818,20 +12970,23 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 48 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -12853,12 +13008,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12866,12 +13022,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -12891,7 +13047,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -12909,36 +13065,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 76288 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12946,35 +13102,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12985,23 +13141,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13018,7 +13176,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -13026,13 +13184,13 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -13040,16 +13198,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13057,21 +13215,24 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13080,15 +13241,15 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13096,8 +13257,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13105,38 +13267,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -13145,50 +13307,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 256 + LSCA: 64 LSCB: 32 - LSPA: 2 + LSPA: 16 LSPB: 32 - LVCA: 128 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13196,14 +13358,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -13216,30 +13378,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13257,38 +13420,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13301,42 +13465,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13344,90 +13509,89 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13435,14 +13599,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -13455,30 +13619,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13496,38 +13661,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 + StreamKXCCMapping: 0 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 64 + ThreadTileA: 32 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13537,45 +13703,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13583,90 +13750,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 256 + LSCA: 128 LSCB: 32 - LSPA: 1 + LSPA: 8 LSPB: 32 - LVCA: 256 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13675,14 +13842,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13694,30 +13861,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13735,38 +13903,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13779,42 +13948,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13822,7 +13992,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= BufferLoad: true BufferStore: true CUCount: null @@ -13832,27 +14001,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -13862,39 +14031,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69120 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 69120 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 144896 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69120 - LdsOffsetMetadata_Blk: 144896 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13903,37 +14072,37 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -13941,22 +14110,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 12 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13964,7 +14134,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -13974,38 +14144,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14015,34 +14186,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -14052,8 +14223,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14061,37 +14233,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x384x32_MI16LKEloQ4c6Y11zX7UW67eJt-lIulINZyn1htvg5ifdns= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -14101,51 +14273,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14153,26 +14325,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 192 MacroTile1: 384 - MacroTileA: 160 + MacroTileA: 192 MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -14180,22 +14352,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 10 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 NumLoadsB: 12 - NumLoadsCoalescedA: 5 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14204,7 +14377,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14213,38 +14386,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14254,45 +14428,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14300,7 +14475,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= BufferLoad: true BufferStore: true CUCount: null @@ -14310,28 +14485,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -14340,102 +14515,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 64 - LSPA: 16 - LSPB: 2 - LVCA: 4 - LVCB: 32 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13824 + LdsBytesNoAmax: 148736 LdsInitCVgprs: false - LdsNumBytes: 13824 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 148736 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 41600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 74368 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 107136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 107136 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14452,32 +14628,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 64 + ThreadTile1: 5 + ThreadTileA: 64 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14493,45 +14670,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14539,7 +14717,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1U9LsBxJQCPuoeG_Ve_3oyvucaAXJtcLblknVGrUCZZs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= BufferLoad: true BufferStore: true CUCount: null @@ -14549,69 +14727,69 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA6_NTB7_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 130560 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 130560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 37888 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 37888 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14620,37 +14798,37 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -14658,23 +14836,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 7 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14691,32 +14870,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA6_NTB7_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14732,34 +14912,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -14769,8 +14949,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14778,7 +14959,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xziis8ptTdenpUsOV8XyKZ82SY2P3qZnzwGDCdyAHeJ8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= BufferLoad: true BufferStore: true CUCount: null @@ -14788,108 +14969,108 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -14897,22 +15078,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14920,8 +15102,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14930,32 +15112,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14974,31 +15157,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -15008,8 +15191,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15017,7 +15201,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x32_MI16x1Zz5Ec09HIvd_O008_hCtRDLwTrwyfoXpxKA0U2dQ67Y= BufferLoad: true BufferStore: true CUCount: null @@ -15029,130 +15212,131 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 8 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12352 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 12352 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2048 - LdsOffsetMetadata_Blk: 10240 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15160,7 +15344,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15169,32 +15353,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15210,45 +15395,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15256,118 +15442,118 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAXUwb1N4QNEBBAkwsktnlZYTW07RiPx_dgvcHi9E9M4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 124416 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 124416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -15376,21 +15562,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 6 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15399,7 +15586,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15408,36 +15595,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15452,31 +15640,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -15488,6 +15676,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15495,7 +15684,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xY-N0YQfxOyAq02GUONAQj81wtsNxroPjTozQOGUD8fI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= BufferLoad: true BufferStore: true CUCount: null @@ -15505,131 +15694,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 256 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 4 + LSPB: 32 LVCA: 16 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 256 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15647,32 +15837,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15688,45 +15879,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15734,7 +15926,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16x1w5ApfbK1Jpefo37YDyHXcxDPX0iSzwdDGLbbUauHzQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= BufferLoad: true BufferStore: true CUCount: null @@ -15744,131 +15936,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15877,7 +16070,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15886,32 +16079,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15927,45 +16121,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15973,7 +16168,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= BufferLoad: true BufferStore: true CUCount: null @@ -15987,63 +16182,63 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35840 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -16056,7 +16251,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16065,14 +16260,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16084,7 +16279,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -16094,20 +16289,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16125,18 +16321,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -16146,15 +16342,16 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16169,8 +16366,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -16178,22 +16375,22 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -16201,10 +16398,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16212,12 +16410,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x224x32_MI16xYiD-R4lcoMOg1xQZgGsiJ1KZ02vOjJZGyWHp18MOnjY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -16226,65 +16423,65 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA4_NTB7_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 64 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44032 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 44032 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 35840 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44032 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -16292,11 +16489,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -16304,26 +16501,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 224 - MacroTileA: 64 - MacroTileB: 224 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -16331,22 +16528,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16355,7 +16553,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16364,36 +16562,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA4_NTB7_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 7 - ThreadTileA: 8 - ThreadTileB: 7 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16408,33 +16607,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -16442,8 +16641,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16451,38 +16651,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -16491,50 +16691,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 32 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16542,15 +16742,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16562,30 +16762,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16594,7 +16795,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16603,38 +16804,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -16644,45 +16846,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16690,77 +16893,77 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xA4mLD4rkXZE5Tg5jX-PM_ibHLJxGPZ0dOqagujttYPo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 53376 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 53376 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -16768,12 +16971,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16781,15 +16984,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16801,30 +17004,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16833,7 +17037,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16842,32 +17046,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16883,45 +17088,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16929,7 +17135,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= BufferLoad: true BufferStore: true CUCount: null @@ -16939,27 +17145,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -16969,50 +17175,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 16 - LSCB: 256 - LSPA: 64 - LSPB: 4 - LVCA: 4 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17020,14 +17226,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17040,31 +17246,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17081,27 +17288,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -17110,7 +17318,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17122,7 +17330,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -17132,35 +17340,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17168,7 +17377,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= BufferLoad: true BufferStore: true CUCount: null @@ -17182,104 +17391,104 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 49664 + LdsNumBytes: 28800 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 16384 LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17287,23 +17496,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 8 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17311,7 +17521,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17320,31 +17530,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -17361,7 +17572,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -17371,24 +17582,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -17398,8 +17609,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17407,7 +17619,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xA1XSikclny7NW89DtILrnItshh0pKUMzTEwj8VAfIF8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= BufferLoad: true BufferStore: true CUCount: null @@ -17417,27 +17629,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -17447,37 +17659,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 98816 + LdsNumBytes: 57600 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17487,10 +17699,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17498,14 +17710,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17518,7 +17730,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17526,22 +17738,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17559,32 +17772,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17604,32 +17818,32 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -17639,6 +17853,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17646,7 +17861,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1MdvvF3WuLvW5zfefjrUPKxH3mZUuZSByYW7fuq5zh-k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= BufferLoad: true BufferStore: true CUCount: null @@ -17660,23 +17875,23 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -17686,37 +17901,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17729,7 +17944,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17737,14 +17952,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17757,7 +17972,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17765,22 +17980,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17798,32 +18014,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17843,30 +18060,30 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -17876,8 +18093,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17885,7 +18103,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1LoQT1m4fy0z4hWOafpHqcYDvq8L8GKbCru3r0T1zEI8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= BufferLoad: true BufferStore: true CUCount: null @@ -17895,27 +18113,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -17925,39 +18143,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC7_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -17966,37 +18184,37 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -18004,22 +18222,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18037,32 +18256,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC7_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18078,11 +18298,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -18090,24 +18310,24 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -18117,6 +18337,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18124,38 +18345,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -18164,38 +18385,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -18204,10 +18425,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18215,15 +18436,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18235,7 +18456,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -18243,22 +18464,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18266,8 +18488,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18276,38 +18498,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18320,31 +18543,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -18356,6 +18579,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18363,38 +18587,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI323gLHO3RJpkSFxOlGwg-m6naxNwlrKpvPr78QjVKcUOs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -18403,80 +18627,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -18484,20 +18708,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18515,32 +18740,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18556,45 +18782,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18602,90 +18829,89 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 47616 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 47616 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 47616 - LdsOffsetMetadata_Blk: 78336 - LdsPadA: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18693,15 +18919,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 256 - MacroTileA: 80 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18713,30 +18939,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18745,7 +18972,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18754,38 +18981,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 8 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18798,42 +19026,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18841,7 +19070,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1uqhzgX0z42t9VByvRAi39cRVaYcY67x6Oj8SvZwousc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= BufferLoad: true BufferStore: true CUCount: null @@ -18855,76 +19084,76 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 + LSCA: 128 LSCB: 64 LSPA: 8 - LSPB: 2 - LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18932,15 +19161,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18952,7 +19181,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -18960,23 +19189,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18984,7 +19214,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18993,32 +19223,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19034,34 +19265,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -19069,10 +19300,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19080,7 +19312,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xIqJeydFF1_Tl-klQcK_JXBaUx3vbPPgLlD9LFHD3nOM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= BufferLoad: true BufferStore: true CUCount: null @@ -19090,28 +19322,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19120,37 +19352,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -19158,12 +19390,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19171,15 +19403,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19191,30 +19423,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19232,32 +19465,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19273,45 +19507,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19319,7 +19554,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32T9xX-njlVFOTQFt-goBtdXkiPVfAmCKuzRDcK3R4fjA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= BufferLoad: true BufferStore: true CUCount: null @@ -19333,24 +19568,24 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19359,37 +19594,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 256 + LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 61632 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 61632 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -19402,7 +19637,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19411,14 +19646,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19430,7 +19665,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -19438,22 +19673,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19471,32 +19707,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19512,34 +19749,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [128, 2, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -19547,10 +19784,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19558,38 +19796,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19598,38 +19836,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 55808 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 25088 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25088 - LdsOffsetB_Blk: 90624 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 90624 - LdsPadA: 16 + LdsOffsetMetadata: 55808 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -19638,10 +19876,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19650,14 +19888,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 64 - MacroTileA: 192 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19669,7 +19907,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -19679,20 +19917,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19710,32 +19949,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 4 - ThreadTileA: 12 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19751,11 +19991,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -19766,19 +20006,19 @@ WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -19786,10 +20026,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19797,12 +20038,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x192x32_MI16idrImgWgCO7nvcJQBWTDXiMZm2WoqSF8tVNvZiYJ_9I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -19811,23 +20052,23 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -19837,10 +20078,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 32 @@ -19850,26 +20091,26 @@ LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -19877,38 +20118,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -19916,22 +20157,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19940,7 +20182,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19949,32 +20191,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20002,33 +20245,34 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20036,7 +20280,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1VZJZ9S4ao24zPbzU9SV6fXZLWTiSOVG8G43480XLTao= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= BufferLoad: true BufferStore: true CUCount: null @@ -20046,108 +20290,108 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB1_NTC6_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 64 - LSPA: 8 - LSPB: 2 - LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 140416 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 140416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 37440 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 70208 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 102976 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 102976 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 9] + MIWaveTileA: 2 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -20155,23 +20399,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 9 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20179,7 +20424,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20188,32 +20433,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB1_NTC6_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20229,45 +20475,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20275,7 +20522,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1MslfBQ8HXWroQ8UfCHLcp_snVXSVDA54YqmEx6V5Oes= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= BufferLoad: true BufferStore: true CUCount: null @@ -20285,131 +20532,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 128 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20427,18 +20675,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -20448,11 +20696,12 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20468,45 +20717,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20514,7 +20764,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= BufferLoad: true BufferStore: true CUCount: null @@ -20525,27 +20774,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -20554,11 +20803,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 0 LSCA: 128 LSCB: 64 LSPA: 8 @@ -20568,13 +20817,13 @@ LVPA: 2 LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 114944 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -20583,21 +20832,21 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 + LdsOffsetMetadata: 49664 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20605,9 +20854,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 @@ -20625,30 +20874,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20666,36 +20916,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20707,45 +20958,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20753,78 +21005,78 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -20833,10 +21085,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20844,15 +21096,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20864,7 +21116,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -20872,22 +21124,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20896,7 +21149,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20905,38 +21158,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20949,42 +21203,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20992,7 +21247,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= BufferLoad: true BufferStore: true CUCount: null @@ -21002,27 +21257,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -21032,37 +21287,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -21072,10 +21327,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21103,7 +21358,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -21111,22 +21366,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21144,18 +21400,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -21165,6 +21421,7 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 ThreadTile1: 1 @@ -21197,33 +21454,34 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21231,38 +21489,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xduzYOnXWtXrhkDsB36ISXJDAIsEQJoO-tbdr76LoaHU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21271,80 +21529,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -21352,20 +21610,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21374,7 +21633,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21383,38 +21642,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21424,45 +21684,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21470,37 +21731,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -21510,101 +21771,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 57472 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 57472 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 78336 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 160 - MacroTileA: 96 - MacroTileB: 160 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 3 - NumLoadsB: 5 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21613,7 +21875,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21622,32 +21884,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21663,45 +21926,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21709,37 +21973,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -21749,37 +22013,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 33792 + LdsNumBytes: 57600 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -21787,12 +22051,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21800,15 +22064,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21820,30 +22084,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21851,8 +22116,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21861,32 +22126,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21906,41 +22172,42 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21948,12 +22215,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -21962,24 +22229,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21988,39 +22255,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 2560 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 20992 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20992 - LdsOffsetB_Blk: 86528 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 86528 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -22028,38 +22295,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -22067,22 +22334,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 - NumLoadsCoalescedA: 5 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22091,7 +22359,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22100,32 +22368,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22141,34 +22410,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -22178,8 +22447,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22187,12 +22457,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -22201,24 +22471,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -22227,39 +22497,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 83968 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -22267,38 +22537,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -22306,31 +22576,32 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22339,32 +22610,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22380,45 +22652,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22426,37 +22699,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -22466,78 +22739,78 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8704 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 8704 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -22545,23 +22818,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22578,32 +22852,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22619,45 +22894,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22665,91 +22941,91 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14848 + LdsBytesNoAmax: 61696 LdsInitCVgprs: false - LdsNumBytes: 14848 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -22757,26 +23033,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -22785,21 +23061,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22817,38 +23094,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 48 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 48 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -22858,45 +23136,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22904,90 +23183,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 19456 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 36864 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 36864 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22995,15 +23274,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23015,31 +23294,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 128 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23047,7 +23327,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23056,32 +23336,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23101,41 +23382,42 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23143,7 +23425,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= BufferLoad: true BufferStore: true CUCount: null @@ -23153,80 +23435,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 768 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64768 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 64768 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 6656 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23234,15 +23516,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23254,30 +23536,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23286,7 +23569,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23295,32 +23578,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23336,246 +23620,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62976 - LdsInitCVgprs: false - LdsNumBytes: 62976 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 55296 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 73216 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62976 - LdsOffsetMetadata_Blk: 73216 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 6] - MIWaveTileA: 3 - MIWaveTileB: 6 - MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 384 - MacroTileA: 48 - MacroTileB: 384 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 6 - ThreadTileA: 12 - ThreadTileB: 6 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -23585,35 +23630,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23621,38 +23667,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1J1zmPJrEydRWke4dDOrYNixyg3y7uYK8qYrzIoZUnhg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -23661,27 +23707,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 12800 + LdsNumBytes: 28800 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 @@ -23690,7 +23736,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 + LdsOffsetMetadata: 8192 LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 8 @@ -23699,12 +23745,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23713,14 +23759,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23732,30 +23778,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23763,8 +23810,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23772,19 +23819,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -23794,11 +23841,12 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23817,8 +23865,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -23826,33 +23874,34 @@ WavefrontSize: 64 WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23860,37 +23909,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1ykZbW8kjS_Mwo8JgpLmIp5amBOKXc9UByonlcmoBU14= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -23900,11 +23949,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -23914,36 +23963,36 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31232 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 31232 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31232 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23952,14 +24001,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23971,33 +24020,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24013,19 +24061,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC8_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24035,11 +24083,12 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24053,50 +24102,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24104,37 +24151,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32xcsdRVX0ybSkhmdCuVjb4BJMOuJqgW1kHyForr86LIZ4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -24144,50 +24191,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24195,15 +24242,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24215,9 +24262,9 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24225,23 +24272,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24257,39 +24303,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 48 + ThreadTileA: 16 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24297,12 +24344,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -24312,35 +24356,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24348,12 +24393,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x160x32_MI16gSO2xcUFkAwgSU8xxy1gJJLLVRqWr-K3o6BUJWoBh3E= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -24362,24 +24406,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -24388,11 +24432,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5120_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 32 LSPA: 16 @@ -24401,26 +24445,26 @@ LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 5120 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64512 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 64512 - LdsNumElementsAlignedA: 41472 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 41472 - LdsOffsetB_Blk: 107008 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64512 - LdsOffsetMetadata_Blk: 107008 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -24428,38 +24472,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [5, 10] - MIWaveTileA: 5 - MIWaveTileB: 10 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 160 - MacroTileA: 320 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -24470,22 +24514,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 200 - NumGlobalWriteVectorsPerThread: 200 - NumLoadsA: 10 - NumLoadsB: 5 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24493,7 +24536,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24501,33 +24544,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5120_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 10 - ThreadTileA: 20 - ThreadTileB: 10 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24541,41 +24585,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24585,6 +24626,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24592,7 +24634,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= BufferLoad: true BufferStore: true CUCount: null @@ -24602,27 +24644,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -24632,75 +24674,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 98560 + LdsNumBytes: 57600 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -24712,24 +24754,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24745,32 +24785,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -24785,12 +24826,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -24800,24 +24838,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -24829,6 +24867,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24836,7 +24875,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xsSK0i4V19YfEXRCT7lci-LV6L-p6I3IHEgHrfhVIEvM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= BufferLoad: true BufferStore: true CUCount: null @@ -24850,24 +24889,24 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -24876,11 +24915,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 LSCA: 128 LSCB: 64 LSPA: 8 @@ -24889,7 +24928,7 @@ LVCB: 16 LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 114944 @@ -24911,15 +24950,15 @@ LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24927,9 +24966,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 @@ -24947,7 +24986,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -24956,24 +24995,23 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24989,32 +25027,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -25029,41 +25068,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25073,6 +25109,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25080,7 +25117,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xFcTBM0p4-B4u4RHcDFjzYdbTkiu3QTU3Jb2iD88fU9Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= BufferLoad: true BufferStore: true CUCount: null @@ -25094,65 +25131,65 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 24704 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25160,38 +25197,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -25200,24 +25237,23 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25233,33 +25269,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25273,50 +25310,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25324,7 +25359,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xd5KB-3Wsu99IEswXfBw5F2zAdcz0rxdn-nsREODY4wE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= BufferLoad: true BufferStore: true CUCount: null @@ -25334,134 +25369,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25477,33 +25510,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25517,50 +25551,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25568,12 +25600,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x8uILL1fCyh1qTBHYDw8Fhlq-ej8sl5xDCmV3PfmJi3g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -25582,24 +25614,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -25608,38 +25640,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 64 - LSPA: 8 + LSPA: 64 LSPB: 16 - LVCA: 32 + LVCA: 4 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 23552 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49664 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -25651,7 +25683,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -25659,14 +25691,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -25679,7 +25711,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25688,24 +25720,23 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25721,33 +25752,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25761,41 +25793,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25805,6 +25834,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25812,7 +25842,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= BufferLoad: true BufferStore: true CUCount: null @@ -25826,24 +25856,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -25852,11 +25882,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -25865,26 +25895,26 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 50176 + LdsNumBytes: 25600 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25892,11 +25922,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -25904,26 +25934,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25931,32 +25961,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -25965,37 +25994,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26005,41 +26035,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26049,6 +26076,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26056,12 +26084,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32jCbGOzCWj1ZReUDKW2CBof4sEzcT56K5GViFAN354XE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -26070,24 +26098,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -26096,39 +26124,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 32 LSPB: 16 - LVCA: 32 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 189440 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 189440 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -26136,11 +26164,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -26148,26 +26176,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -26176,24 +26204,23 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26209,37 +26236,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26249,41 +26277,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26293,6 +26318,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26300,12 +26326,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10774ejW-y_fCHxcslzP6G-lYIY7kmGqxfZavG0O9sDI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -26314,63 +26340,63 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 LSCA: 16 LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 768 + LSPA: 4 + LSPB: 1 + LVCA: 16 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64512 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 64512 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13312 - LdsOffsetMetadata_Blk: 46080 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -26383,7 +26409,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26391,15 +26417,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26411,7 +26437,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -26426,18 +26452,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26453,32 +26478,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 12 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -26493,12 +26519,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -26508,24 +26531,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -26537,6 +26560,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26544,115 +26568,115 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 1 + LVCA: 16 + LVCB: 256 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 68096 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 68096 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 68096 - LdsOffsetMetadata_Blk: 164352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -26663,25 +26687,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26697,39 +26719,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 64 + ThreadTileA: 4 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -26737,39 +26760,36 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -26779,8 +26799,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26788,7 +26809,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xi2Jigudy-18K0UMT5ct2MdW8qfs4vqYPQZeulKeB3FE= BufferLoad: true BufferStore: true CUCount: null @@ -26802,24 +26822,24 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -26828,10 +26848,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -26841,7 +26861,7 @@ LVCB: 16 LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 114944 @@ -26863,15 +26883,15 @@ LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26879,9 +26899,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 @@ -26910,13 +26930,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -26924,8 +26944,6 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26941,32 +26959,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM2_WGMXCC32_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -26981,39 +26999,36 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -27021,10 +27036,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27032,7 +27048,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32x3gfKViGSlyTzbeVb2aUOzEk6CS8J7voKdng2XPN-XE4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= BufferLoad: true BufferStore: true CUCount: null @@ -27042,28 +27058,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27072,50 +27088,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 49664 + LdsNumBytes: 132096 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 66048 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49664 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27123,14 +27139,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -27145,7 +27161,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -27153,30 +27169,28 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -27185,32 +27199,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -27225,50 +27239,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27276,37 +27288,36 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32pZ9lLvhCdmH7RpWQsDwDXdHIV2y4SsFgkBGs2DezxyQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -27316,37 +27327,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 66560 + LdsNumBytes: 114944 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -27354,12 +27365,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27367,14 +27378,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -27389,7 +27400,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -27397,23 +27408,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27429,33 +27438,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27469,50 +27478,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27520,7 +27527,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= BufferLoad: true BufferStore: true CUCount: null @@ -27531,27 +27538,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27560,50 +27567,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 64 LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LSPB: 1 + LVCA: 4 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 8 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27611,15 +27618,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27633,31 +27640,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 4 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 16 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27673,39 +27678,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -27713,279 +27718,302 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [4, 30, 8192, 128] - - [60, 0.0] + - [29, 0.0] - - [16, 33, 8192, 128] - - [61, 0.0] + - [106, 8.99] - - [40, 61, 8192, 128] - [0, 0.0] - - [128, 17711, 1, 960] - - [1, 0.0] + - [82, 58.48] - - [128, 17711, 1, 2480] - - [2, 0.0] + - [1, 0.0] - - [252, 17711, 1, 128] - - [3, 0.0] + - [81, 39.0] - - [256, 17711, 1, 128] - - [4, 0.0] + - [2, 0.0] - - [384, 246, 1, 17711] - - [82, 0.0] + - [36, 0.0] - - [384, 768, 1, 17711] - - [83, 0.0] + - [37, 0.0] - - [928, 17711, 1, 128] - - [101, 103696.0] + - [65, 59.92] - - [2732, 17711, 1, 384] - - [102, 179908.0] + - [64, 90.61] - - [6, 128, 17711, 41] - - [5, 0.0] + - [3, 0.0] - - [20, 124, 17711, 48] - - [6, 0.0] + - [4, 0.0] - - [41, 6, 17711, 128] - - [7, 0.0] + - [5, 0.0] - - [256, 256, 41, 17711] - - [8, 0.0] + - [6, 0.0] - - [1, 1, 1, 4096] - - [9, 0.0] + - [109, 0.0] - - [1, 4096, 1, 256] - - [64, 0.0] + - [105, 0.18] - - [1, 4096, 1, 512] - - [65, 0.0] + - [105, 0.3] - - [28, 4096, 1, 256] - - [66, 0.0] + - [102, 4.54] - - [28, 4096, 1, 320] - - [10, 0.0] + - [98, 5.04] - - [57, 262144, 1, 32] - - [67, 0.0] + - [30, 0.0] - - [64, 102400, 1, 64] - - [68, 0.0] + - [83, 24.71] - - [64, 131072, 1, 64] - - [69, 0.0] + - [31, 0.0] - - [64, 131072, 1, 128] - - [70, 0.0] + - [101, 36.73] - - [64, 819200, 1, 64] - - [11, 0.0] + - [83, 29.19] - - [72, 4096, 1, 256] - - [12, 0.0] + - [99, 9.21] - - [72, 4096, 1, 320] - - [13, 0.0] + - [97, 10.63] - - [82, 262144, 1, 32] - - [14, 0.0] + - [95, 16.68] - - [116, 4096, 1, 256] - - [15, 0.0] + - [7, 0.0] - - [116, 4096, 1, 320] - - [16, 0.0] + - [8, 0.0] - - [128, 4096, 1, 2048] - - [17, 0.0] + - [96, 44.23] - - [128, 131072, 1, 64] - - [76, 0.0] + - [34, 0.0] - - [160, 655360, 1, 10] - - [77, 0.0] + - [66, 7.73] - - [180, 4096, 1, 256] - - [18, 0.0] + - [9, 0.0] - - [180, 4096, 1, 320] - - [110, 47842.7] + - [87, 21.73] - - [192, 655360, 1, 48] - - [19, 0.0] + - [94, 34.07] - - [192, 655360, 1, 112] - - [20, 0.0] + - [59, 52.18] - - [224, 527553, 1, 64] - - [21, 0.0] + - [84, 39.78] - - [224, 752863, 1, 64] - - [22, 0.0] + - [85, 41.01] - - [256, 1, 1, 4096] - - [79, 0.0] + - [110, 0.1] - - [256, 4096, 1, 28] - - [23, 0.0] + - [91, 4.17] - - [256, 4096, 1, 72] - - [24, 0.0] + - [100, 9.58] - - [256, 4096, 1, 116] - - [25, 0.0] + - [89, 13.24] - - [256, 4096, 1, 180] - - [26, 0.0] + - [89, 17.83] - - [256, 4096, 1, 256] - - [27, 0.0] + - [88, 26.1] - - [256, 4096, 1, 7680] - - [81, 0.0] + - [113, 86.69] - - [288, 806154, 1, 64] - - [28, 0.0] + - [93, 40.27] - - [512, 1, 1, 4096] - - [84, 0.0] + - [110, 0.2] - - [512, 4096, 1, 1] - - [29, 0.0] + - [92, 0.27] - - [512, 4096, 1, 160] - - [30, 0.0] + - [90, 31.19] - - [512, 4096, 1, 512] - - [31, 0.0] + - [86, 54.79] - - [512, 4096, 1, 2246] - - [32, 0.0] + - [103, 78.03] - - [512, 4096, 1, 9216] - - [87, 0.0] + - [39, 0.0] - - [512, 4096, 1, 30816] - - [33, 0.0] + - [10, 0.0] - - [1600, 4096, 1, 128] - - [89, 0.0] + - [67, 52.68] - - [1824, 4096, 1, 2048] - - [90, 0.0] + - [41, 0.0] - - [2048, 4096, 1, 57] - - [34, 0.0] + - [67, 29.92] - - [2048, 4096, 1, 64] - - [111, 59310.0] + - [52, 59310.0] - - [2048, 4096, 1, 82] - - [91, 0.0] + - [42, 0.0] - - [2048, 4096, 1, 160] - - [35, 0.0] + - [67, 64.51] - - [2048, 4096, 1, 2048] - - [36, 0.0] + - [11, 0.0] - - [2246, 4096, 1, 2048] - - [37, 0.0] + - [12, 0.0] - - [2560, 4096, 1, 4096] - - [92, 0.0] + - [43, 0.0] - - [2624, 4096, 1, 2048] - - [38, 0.0] + - [68, 115.68] - - [25, 25, 8192, 32] - - [93, 0.0] + - [44, 0.0] - - [32, 25, 8192, 25] - - [94, 0.0] + - [45, 0.0] - - [32, 57, 4096, 64] - - [95, 0.0] + - [46, 0.0] - - [32, 82, 4096, 64] - - [96, 0.0] + - [47, 0.0] - - [48, 192, 4096, 160] - - [97, 0.0] + - [48, 0.0] - - [48, 642, 4096, 160] - - [98, 0.0] + - [49, 0.0] - - [64, 32, 4096, 200] - - [99, 0.0] + - [107, 22.3] - - [200, 32, 4096, 64] - - [39, 0.0] + - [108, 15.84] - - [256, 2048, 1, 128] - - [40, 0.0] + - [13, 0.0] - - [512, 2048, 1, 14336] - - [41, 0.0] + - [14, 0.0] - - [1024, 2048, 1, 128] - - [88, 0.0] + - [40, 0.0] - - [1024, 2048, 1, 14336] - - [42, 0.0] + - [15, 0.0] - - [1, 8192, 1, 128] - - [43, 0.0] + - [16, 0.0] - - [1, 8192, 1, 256] - - [44, 0.0] + - [17, 0.0] - - [120, 8192, 1, 256] - - [103, 52872.0] + - [50, 52872.0] - - [128, 1, 1, 8192] - - [45, 0.0] + - [114, 0.1] - - [128, 8192, 1, 256] - - [46, 0.0] + - [18, 0.0] - - [128, 8192, 1, 2440] - - [47, 0.0] + - [74, 53.83] - - [128, 8192, 1, 5120] - - [48, 0.0] + - [19, 0.0] - - [128, 8192, 1, 5640] - - [49, 0.0] + - [20, 0.0] - - [256, 1, 1, 8192] - - [50, 0.0] + - [21, 0.0] - - [256, 8192, 1, 512] - - [105, 124161.0] + - [77, 54.43] - - [256, 8192, 1, 528] - - [106, 117688.0] + - [78, 50.73] - - [256, 8192, 1, 2048] - - [51, 0.0] + - [79, 71.81] - - [256, 98304, 1, 128] - - [52, 0.0] + - [22, 0.0] - - [512, 8192, 1, 120] - - [107, 69570.3] + - [80, 35.22] - - [512, 8192, 1, 512] - - [53, 0.0] + - [23, 0.0] - - [512, 8192, 1, 528] - - [54, 0.0] + - [24, 0.0] - - [512, 8192, 1, 1980] - - [55, 0.0] + - [25, 0.0] - - [512, 8192, 1, 2048] - - [56, 0.0] + - [26, 0.0] - - [512, 8192, 1, 3072] - - [57, 0.0] + - [27, 0.0] - - [528, 8192, 1, 256] - - [58, 0.0] + - [28, 0.0] - - [10880, 8192, 1, 128] - - [59, 0.0] + - [63, 65.6] - - [1, 1024, 1, 128] - - [62, 0.0] + - [104, 0.02] - - [1, 4096, 1, 1] - - [63, 0.0] + - [104, 0.0] - - [128, 1, 1, 1024] - - [71, 0.0] + - [32, 0.0] - - [128, 41, 1, 17711] - - [72, 0.0] + - [33, 0.0] - - [128, 1024, 1, 128] - - [73, 0.0] + - [72, 2.92] - - [128, 1024, 1, 4096] - - [74, 0.0] + - [69, 28.91] - - [128, 1024, 1, 7456] - - [75, 0.0] + - [112, 39.35] - - [128, 17711, 1, 128] - - [100, 57292.0] + - [82, 25.45] - - [233, 131072, 1, 56] - - [78, 0.0] + - [35, 0.0] - - [256, 1024, 1, 128] - - [80, 0.0] + - [70, 5.57] - - [512, 1024, 1, 128] - - [85, 0.0] + - [73, 9.94] - - [512, 1024, 1, 2011] - - [86, 0.0] + - [38, 0.0] - - [4096, 1024, 1, 128] - - [113, 83849.7] + - [55, 44.33] - - [32, 233, 1024, 128] - - [115, 53858.6] + - [54, 53858.6] - - [256, 8192, 1, 256] - - [104, 88136.7] + - [77, 40.67] - - [512, 8192, 1, 256] - - [108, 122522.0] + - [51, 122522.0] - - [1024, 8192, 1, 512] - - [109, 187269.0] + - [57, 95.36] - - [2011, 1024, 1, 512] - - [112, 117998.0] + - [56, 55.01] - - [7968, 1024, 1, 256] - - [114, 135836.0] + - [53, 135836.0] + - - [3072, 8192, 1, 512] + - [58, 103.91] + - - [4352, 8192, 1, 256] + - [59, 87.22] + - - [4608, 8192, 1, 256] + - [60, 86.41] + - - [5120, 8192, 1, 128] + - [61, 63.17] + - - [5640, 8192, 1, 128] + - [62, 58.59] + - - [7296, 8192, 1, 128] + - [60, 65.76] + - - [4132, 4096, 1, 256] + - [66, 77.55] + - - [4132, 4096, 1, 512] + - [66, 98.72] + - - [128, 1024, 1, 1] + - [71, 0.03] + - - [256, 8192, 1, 1] + - [75, 0.28] + - - [256, 8192, 1, 120] + - [76, 22.65] + - - [256, 4096, 1, 1] + - [91, 0.15] + - - [256, 1024, 1, 7968] + - [111, 59.32] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..6b0e5a39bae --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml @@ -0,0 +1,14733 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 0058] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148736 + LdsInitCVgprs: false + LdsNumBytes: 148736 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 41600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74368 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 107136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 107136 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 5 + ThreadTileA: 64 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130560 + LdsInitCVgprs: false + LdsNumBytes: 130560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124416 + LdsInitCVgprs: false + LdsNumBytes: 124416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53376 + LdsInitCVgprs: false + LdsNumBytes: 53376 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 28800 + LdsInitCVgprs: false + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41472 + LdsInitCVgprs: false + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61632 + LdsInitCVgprs: false + LdsNumBytes: 61632 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55808 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 140416 + LdsInitCVgprs: false + LdsNumBytes: 140416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 37440 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 70208 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 102976 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 102976 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 9] + MIWaveTileA: 2 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31744 + LdsInitCVgprs: false + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53312 + LdsInitCVgprs: false + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57472 + LdsInitCVgprs: false + LdsNumBytes: 57472 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 83968 + LdsInitCVgprs: false + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61696 + LdsInitCVgprs: false + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 1 + ThreadTileA: 48 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 28800 + LdsInitCVgprs: false + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53312 + LdsInitCVgprs: false + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24704 + LdsInitCVgprs: false + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23552 + LdsInitCVgprs: false + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 189440 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 189440 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 1 + LVCA: 16 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 1 + LVCA: 16 + LVCB: 256 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 1 + LVCA: 4 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 1024, 1, 4096] + - [0, 28.91] + - - [4096, 1024, 1, 128] + - [1, 44.33] + - - [2011, 1024, 1, 512] + - [2, 55.01] + - - [1024, 8192, 1, 512] + - [3, 95.36] + - - [3072, 8192, 1, 512] + - [4, 103.91] + - - [4352, 8192, 1, 256] + - [5, 87.22] + - - [4608, 8192, 1, 256] + - [6, 86.41] + - - [5120, 8192, 1, 128] + - [7, 63.17] + - - [5640, 8192, 1, 128] + - [8, 58.59] + - - [7296, 8192, 1, 128] + - [6, 65.76] + - - [10880, 8192, 1, 128] + - [9, 65.6] + - - [2732, 17711, 1, 384] + - [10, 90.61] + - - [928, 17711, 1, 128] + - [11, 59.92] + - - [4132, 4096, 1, 256] + - [12, 77.55] + - - [2048, 4096, 1, 160] + - [13, 64.51] + - - [4132, 4096, 1, 512] + - [12, 98.72] + - - [2624, 4096, 1, 2048] + - [14, 115.68] + - - [2048, 4096, 1, 57] + - [13, 29.92] + - - [1600, 4096, 1, 128] + - [13, 52.68] + - - [256, 1024, 1, 128] + - [15, 5.57] + - - [128, 1024, 1, 1] + - [16, 0.03] + - - [128, 1024, 1, 128] + - [17, 2.92] + - - [512, 1024, 1, 128] + - [18, 9.94] + - - [128, 8192, 1, 2440] + - [19, 53.83] + - - [256, 8192, 1, 1] + - [20, 0.28] + - - [256, 8192, 1, 120] + - [21, 22.65] + - - [256, 8192, 1, 256] + - [22, 40.67] + - - [256, 8192, 1, 512] + - [22, 54.43] + - - [256, 8192, 1, 528] + - [23, 50.73] + - - [256, 8192, 1, 2048] + - [24, 71.81] + - - [512, 8192, 1, 120] + - [25, 35.22] + - - [252, 17711, 1, 128] + - [26, 39.0] + - - [128, 17711, 1, 128] + - [27, 25.45] + - - [128, 17711, 1, 960] + - [27, 58.48] + - - [64, 819200, 1, 64] + - [28, 29.19] + - - [224, 527553, 1, 64] + - [29, 39.78] + - - [224, 752863, 1, 64] + - [30, 41.01] + - - [512, 4096, 1, 512] + - [31, 54.79] + - - [180, 4096, 1, 320] + - [32, 21.73] + - - [256, 4096, 1, 256] + - [33, 26.1] + - - [256, 4096, 1, 180] + - [34, 17.83] + - - [512, 4096, 1, 160] + - [35, 31.19] + - - [256, 4096, 1, 116] + - [34, 13.24] + - - [256, 4096, 1, 28] + - [36, 4.17] + - - [512, 4096, 1, 1] + - [37, 0.27] + - - [256, 4096, 1, 1] + - [36, 0.15] + - - [192, 655360, 1, 112] + - [5, 52.18] + - - [288, 806154, 1, 64] + - [38, 40.27] + - - [192, 655360, 1, 48] + - [39, 34.07] + - - [82, 262144, 1, 32] + - [40, 16.68] + - - [128, 4096, 1, 2048] + - [41, 44.23] + - - [72, 4096, 1, 320] + - [42, 10.63] + - - [28, 4096, 1, 320] + - [43, 5.04] + - - [64, 102400, 1, 64] + - [28, 24.71] + - - [72, 4096, 1, 256] + - [44, 9.21] + - - [256, 4096, 1, 72] + - [45, 9.58] + - - [160, 655360, 1, 10] + - [12, 7.73] + - - [64, 131072, 1, 128] + - [46, 36.73] + - - [28, 4096, 1, 256] + - [47, 4.54] + - - [512, 4096, 1, 2246] + - [48, 78.03] + - - [1, 1024, 1, 128] + - [49, 0.02] + - - [1, 4096, 1, 512] + - [50, 0.3] + - - [1, 4096, 1, 256] + - [50, 0.18] + - - [1, 4096, 1, 1] + - [49, 0.0] + - - [16, 33, 8192, 128] + - [51, 8.99] + - - [64, 32, 4096, 200] + - [52, 22.3] + - - [200, 32, 4096, 64] + - [53, 15.84] + - - [1, 1, 1, 4096] + - [54, 0.0] + - - [512, 1, 1, 4096] + - [55, 0.2] + - - [256, 1, 1, 4096] + - [55, 0.1] + - - [256, 1024, 1, 7968] + - [56, 59.32] + - - [128, 1024, 1, 7456] + - [57, 39.35] + - - [256, 4096, 1, 7680] + - [58, 86.69] + - - [128, 1, 1, 8192] + - [59, 0.1] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 06582460569..d1341df36c7 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -82,6 +82,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -131,7 +132,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -318,242 +319,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32vSYX90ppDprEavxj8aYo4_RVHyec3tByzM_fy0-Q5p0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 - LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -603,7 +369,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -711,8 +477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -790,6 +556,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -839,7 +606,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -947,8 +714,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -1026,6 +793,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1033,12 +801,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32HDLrFKOalrtw2KSVTZ7XOtxf5e65xHrmK1BWhomWgmY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xpBVJCKYVdddJ9ZWJn_y6OBqYt3-BIpDKWLuXbExc_BA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1053,7 +821,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -1063,7 +831,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1075,256 +843,20 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 - LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xpBVJCKYVdddJ9ZWJn_y6OBqYt3-BIpDKWLuXbExc_BA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22016 + LdsBytesNoAmax: 22016 LdsInitCVgprs: false LdsNumBytes: 22016 LdsNumElementsAlignedA: 17408 @@ -1419,8 +951,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -1498,6 +1030,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1547,7 +1080,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -1655,8 +1188,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -1734,6 +1267,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1783,7 +1317,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -1891,8 +1425,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1970,6 +1504,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2019,7 +1554,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -2127,8 +1662,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2206,6 +1741,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2255,7 +1791,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -2363,8 +1899,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2442,6 +1978,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2491,7 +2028,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -2599,8 +2136,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2678,6 +2215,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2727,7 +2265,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -2835,8 +2373,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2914,6 +2452,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2963,7 +2502,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -3071,8 +2610,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3150,6 +2689,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xe1BNDmI7JV_4uGeYH8WBcxkTo3OOZcdUmGTChlcXZl4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= BufferLoad: true BufferStore: true CUCount: null @@ -3187,7 +2727,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3199,7 +2739,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -3231,15 +2771,15 @@ LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3247,9 +2787,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 @@ -3275,16 +2815,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 + NonTemporalA: 3 + NonTemporalB: 6 NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -3307,8 +2847,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3318,28 +2858,28 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3351,14 +2891,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 @@ -3382,10 +2922,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,7 +2934,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16x8OQVJ6szc4qGEeyzu1Mvik5oji8_0NULO2KTqu03Wgo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x64x64_MI16x1ZYzOMFyvQ_WZv4g-rr80VpnruR-8YsAaclS0uKKTkXQ= BufferLoad: true BufferStore: true CUCount: null @@ -3403,10 +2944,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3423,7 +2964,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3435,34 +2976,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 + LdsBytesNoAmax: 45056 LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 45056 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3470,10 +3011,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3483,15 +3024,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3505,29 +3046,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -3535,7 +3076,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3543,8 +3084,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -3554,22 +3095,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3587,29 +3128,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -3618,10 +3159,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,20 +3171,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI16xQeJZY1hhTV13Iu-kbxWY2AO4qcIeNqeUeKM_ciFj3tg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT112x320x32_MI16G4GEHbAdPUcJY9T4f4Vw8xA9HnOy9eBCKZR-rA6UYjQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3653,13 +3195,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3671,34 +3213,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 36864 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 36864 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 16128 + LdsNumElementsAlignedB: 42240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 36864 - LdsOffsetMetadata_Blk: 74752 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3706,8 +3248,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -3719,15 +3261,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 112 + MacroTile1: 320 + MacroTileA: 112 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3741,28 +3283,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 2 - NumLoadsB: 6 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 14 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3779,39 +3321,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC32_WGMXCCGn1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 1 + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3823,16 +3365,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3858,6 +3400,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,7 +3408,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= BufferLoad: true BufferStore: true CUCount: null @@ -3907,7 +3450,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -3983,14 +3526,14 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 5 + NonTemporalC: 6 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 12 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 @@ -4015,8 +3558,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4094,6 +3637,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,17 +3645,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x64x64_MI16x1ZYzOMFyvQ_WZv4g-rr80VpnruR-8YsAaclS0uKKTkXQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -4121,7 +3665,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -4131,7 +3675,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4143,34 +3687,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4180,8 +3724,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4192,14 +3736,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4220,20 +3764,20 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 1 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -4242,8 +3786,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4251,18 +3795,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -4274,16 +3818,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4295,8 +3839,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -4304,14 +3848,14 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4326,10 +3870,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,7 +3882,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT112x320x32_MI16G4GEHbAdPUcJY9T4f4Vw8xA9HnOy9eBCKZR-rA6UYjQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= BufferLoad: true BufferStore: true CUCount: null @@ -4347,10 +3892,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4361,7 +3906,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -4379,34 +3924,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123904 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 123904 - LdsNumElementsAlignedA: 16128 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16128 - LdsOffsetB_Blk: 81664 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16128 - LdsOffsetMetadata_Blk: 81664 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4414,12 +3959,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -4428,14 +3973,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 320 - MacroTileA: 112 - MacroTileB: 320 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4449,28 +3994,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 + NonTemporalA: 1 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 14 - NumLoadsB: 10 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4479,7 +4024,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4487,22 +4032,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4510,10 +4055,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4539,15 +4084,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4566,6 +4111,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,27 +4119,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x12f43K0NWsDi5cNJjR7R2JTcg_yqtfEHj2PuqgoJwtDk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -4603,7 +4149,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4615,34 +4161,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4650,10 +4196,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4663,15 +4209,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4685,28 +4231,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 + NonTemporalB: 0 NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4714,7 +4260,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -4723,8 +4269,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4734,21 +4280,21 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -4767,23 +4313,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4802,6 +4348,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,7 +4356,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1yIHfX59pen49io9SVtqOmH7NfRyptehvc2aTbGBwn6Y= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= BufferLoad: true BufferStore: true CUCount: null @@ -4819,10 +4366,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4851,97 +4398,97 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB5_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -4959,8 +4506,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB5_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4975,20 +4522,20 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true @@ -5004,7 +4551,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5016,10 +4563,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5034,10 +4581,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,7 +4593,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= BufferLoad: true BufferStore: true CUCount: null @@ -5065,7 +4613,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5075,7 +4623,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5087,7 +4635,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -5097,24 +4645,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5136,14 +4684,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5164,21 +4712,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5186,8 +4734,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5195,18 +4743,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -5218,10 +4766,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5239,8 +4787,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5248,7 +4796,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5270,10 +4818,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,7 +4830,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1c8C3ENIpF0Lj2Vn5y8emDWGgLXKNNOk6OuhBiu-fS00= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= BufferLoad: true BufferStore: true CUCount: null @@ -5292,9 +4841,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5311,7 +4860,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5323,7 +4872,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -5333,24 +4882,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5358,8 +4907,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -5371,15 +4920,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5393,28 +4942,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5431,33 +4980,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5475,16 +5024,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5497,7 +5046,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -5506,10 +5055,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,17 +5067,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5547,7 +5097,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5559,34 +5109,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5596,10 +5146,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5607,15 +5157,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5635,22 +5185,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 + NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5659,7 +5209,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5667,8 +5217,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5678,22 +5228,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5711,23 +5261,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5746,6 +5296,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,7 +5304,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x12f43K0NWsDi5cNJjR7R2JTcg_yqtfEHj2PuqgoJwtDk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= BufferLoad: true BufferStore: true CUCount: null @@ -5773,7 +5324,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5795,7 +5346,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -5806,13 +5357,13 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 57344 LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -5844,14 +5395,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5871,22 +5422,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5894,7 +5445,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -5903,8 +5454,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5927,9 +5478,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5948,7 +5499,7 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5982,6 +5533,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,12 +5541,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -6019,7 +5571,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6031,7 +5583,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -6041,24 +5593,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -6080,13 +5632,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -6107,21 +5659,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -6139,18 +5691,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -6162,9 +5714,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -6183,7 +5735,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -6218,6 +5770,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,12 +5778,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xUaGw_W8LDx3RieUoB0YjL0FezgOAUjz28DCpyFICrg4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -6255,7 +5808,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6267,7 +5820,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -6277,14 +5830,14 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 51200 + LdsNumBytes: 59392 LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -6295,8 +5848,8 @@ LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 9216 LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6304,11 +5857,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6316,23 +5869,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6343,22 +5896,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6375,8 +5928,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -6386,21 +5939,21 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true @@ -6419,16 +5972,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6454,6 +6007,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,7 +6015,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xnPDI4eDL8B8YkIMGYr-3DuqBaS61ghWLUzLSEYUQwcQ= BufferLoad: true BufferStore: true CUCount: null @@ -6503,7 +6057,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -6516,21 +6070,21 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 135168 + LdsNumBytes: 116224 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 + LdsOffsetA_Blk: 65536 LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6543,7 +6097,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -6552,14 +6106,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6580,21 +6134,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalB: 5 + NonTemporalC: 5 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6611,22 +6165,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -6635,9 +6189,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6656,15 +6210,15 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6677,7 +6231,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6690,6 +6244,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,7 +6252,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x192x32_MI32tuSZnvvLkXYcMml-mOShaT9zCWrl5dAd9wDAlRsHyF0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI167teIb582yCf_SJBIlrVLw1F7Ht3BtRFeb0kjf6Rcpyk= BufferLoad: true BufferStore: true CUCount: null @@ -6707,10 +6262,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6722,12 +6277,12 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6739,48 +6294,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6788,49 +6343,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 10 - NumLoadsB: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6847,39 +6402,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 3 - ThreadTileA: 80 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6891,29 +6446,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6922,10 +6477,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,7 +6489,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= BufferLoad: true BufferStore: true CUCount: null @@ -6963,7 +6519,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6975,7 +6531,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -6985,24 +6541,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 69632 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 165888 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7024,14 +6580,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7051,22 +6607,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7083,18 +6639,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7106,16 +6662,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7127,16 +6683,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7162,6 +6718,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,7 +6726,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= BufferLoad: true BufferStore: true CUCount: null @@ -7199,7 +6756,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7211,7 +6768,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -7221,24 +6778,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7260,14 +6817,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7287,22 +6844,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7311,7 +6868,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7319,22 +6876,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7342,16 +6899,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7363,16 +6920,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7385,7 +6942,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -7398,6 +6955,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,7 +6963,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= BufferLoad: true BufferStore: true CUCount: null @@ -7435,7 +6993,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7447,7 +7005,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -7457,26 +7015,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 125952 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -7484,11 +7042,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -7496,23 +7054,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7523,22 +7081,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7547,7 +7105,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7555,8 +7113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7565,23 +7123,23 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7599,16 +7157,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7630,10 +7188,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,7 +7200,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xUaGw_W8LDx3RieUoB0YjL0FezgOAUjz28DCpyFICrg4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32WgES1xrDYvB_Vsiz8nGLdWqeSq_GbrDnca8tIfV9V44= BufferLoad: true BufferStore: true CUCount: null @@ -7683,7 +7242,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -7694,23 +7253,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 88576 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -7723,7 +7282,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7731,14 +7290,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 160 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 160 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -7762,18 +7321,18 @@ NonTemporalA: 3 NonTemporalB: 0 NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -7791,8 +7350,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7801,29 +7360,29 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 80 + ThreadTile1: 1 + ThreadTileA: 80 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7836,15 +7395,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7870,6 +7429,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,7 +7437,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xnPDI4eDL8B8YkIMGYr-3DuqBaS61ghWLUzLSEYUQwcQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= BufferLoad: true BufferStore: true CUCount: null @@ -7887,7 +7447,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7907,7 +7467,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7919,34 +7479,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 116224 LdsInitCVgprs: false LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedA: 29568 + LdsNumElementsAlignedB: 21120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 29568 + LdsOffsetB_Blk: 95104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 29568 + LdsOffsetMetadata_Blk: 95104 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7956,10 +7516,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7968,14 +7528,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 224 + MacroTile1: 160 + MacroTileA: 224 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7996,21 +7556,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 7 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8019,7 +7579,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8027,22 +7587,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8050,10 +7610,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8071,8 +7631,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8080,14 +7640,14 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -8106,6 +7666,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,7 +7674,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI167teIb582yCf_SJBIlrVLw1F7Ht3BtRFeb0kjf6Rcpyk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= BufferLoad: true BufferStore: true CUCount: null @@ -8123,7 +7684,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -8143,7 +7704,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8155,34 +7716,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8192,10 +7753,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8204,14 +7765,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8231,22 +7792,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8263,22 +7824,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC16_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8286,16 +7847,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -8307,23 +7868,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -8342,6 +7903,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,7 +7911,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI323ndqhClmLf66x-DMa5vtrsXdR3sinuyraIFtI5pO3GY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= BufferLoad: true BufferStore: true CUCount: null @@ -8369,7 +7931,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -8379,7 +7941,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8391,7 +7953,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB6_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -8401,26 +7963,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8428,11 +7990,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -8440,23 +8002,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8467,22 +8029,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8490,7 +8052,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -8499,32 +8061,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB6_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 64 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true @@ -8543,16 +8105,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8574,10 +8136,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,12 +8148,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8605,7 +8168,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -8615,7 +8178,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8627,36 +8190,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69632 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 69632 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69632 - LdsOffsetMetadata_Blk: 165888 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8664,35 +8227,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8704,30 +8267,30 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8735,8 +8298,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8745,23 +8308,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8779,16 +8342,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8814,6 +8377,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,12 +8385,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8841,7 +8405,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -8851,7 +8415,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8863,36 +8427,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8900,35 +8464,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8939,31 +8503,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8971,33 +8535,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9015,16 +8579,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9037,7 +8601,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -9050,6 +8614,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,7 +8622,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= BufferLoad: true BufferStore: true CUCount: null @@ -9077,7 +8642,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -9099,36 +8664,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125952 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 125952 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9136,35 +8701,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9175,31 +8740,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9207,33 +8772,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9252,15 +8817,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9286,6 +8851,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,12 +8859,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32WgES1xrDYvB_Vsiz8nGLdWqeSq_GbrDnca8tIfV9V44= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -9335,7 +8901,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -9346,25 +8912,25 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 + LdsBytesNoAmax: 14336 LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 14336 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9372,35 +8938,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9413,20 +8979,20 @@ NonTemporal: -1 NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 6 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9435,7 +9001,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9443,8 +9009,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -9453,23 +9019,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 1 - ThreadTileA: 80 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9488,7 +9054,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -9496,7 +9062,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9518,10 +9084,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,7 +9096,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x32x64_MI16xSLoxTzKJhLeYO6yA7D7HcG6nMu1LARYTpZpldzWXL4U= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= BufferLoad: true BufferStore: true CUCount: null @@ -9539,7 +9106,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -9571,34 +9138,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA5_NTB7_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 64512 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 195584 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 195584 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -9608,8 +9175,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -9620,14 +9187,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 32 - MacroTileA: 224 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9647,21 +9214,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -9671,7 +9238,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9679,14 +9246,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA5_NTB7_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -9702,10 +9269,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9724,7 +9291,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -9732,14 +9299,14 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9758,6 +9325,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,7 +9333,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= BufferLoad: true BufferStore: true CUCount: null @@ -9807,36 +9375,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 28864 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 21120 + LdsNumBytes: 28864 + LdsNumElementsAlignedA: 4160 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4160 + LdsOffsetB_Blk: 20544 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29568 - LdsOffsetMetadata_Blk: 95104 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4160 + LdsOffsetMetadata_Blk: 20544 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9844,35 +9412,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 160 - MacroTileA: 224 - MacroTileB: 160 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9883,23 +9451,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 7 - NumLoadsB: 5 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 5 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9907,7 +9475,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9915,39 +9483,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -9966,9 +9534,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9994,6 +9562,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,7 +9570,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= BufferLoad: true BufferStore: true CUCount: null @@ -10021,7 +9590,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -10043,7 +9612,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 128 LSCB: 128 @@ -10056,11 +9625,11 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 99328 + LdsNumBytes: 116224 LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -10083,7 +9652,7 @@ LoopIters: 4 LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10092,14 +9661,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10120,21 +9689,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10142,7 +9711,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -10151,8 +9720,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -10175,9 +9744,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10196,7 +9765,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -10230,6 +9799,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,27 +9807,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -10279,34 +9849,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10314,10 +9884,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10349,28 +9919,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 1 NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 12 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10378,7 +9948,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -10387,8 +9957,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -10414,13 +9984,13 @@ ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false UseDot2F32XEmulation: true @@ -10440,14 +10010,14 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -10464,8 +10034,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10473,12 +10044,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -10493,9 +10064,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10503,7 +10075,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10512,39 +10084,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10552,35 +10124,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10591,31 +10163,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10623,33 +10195,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10659,7 +10231,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -10667,16 +10240,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10689,19 +10262,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10709,12 +10284,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -10729,9 +10304,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10739,7 +10315,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10748,39 +10324,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10788,35 +10364,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10827,31 +10403,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10859,43 +10435,44 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -10903,16 +10480,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10930,14 +10507,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10945,7 +10524,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= BufferLoad: true BufferStore: true CUCount: null @@ -10965,9 +10544,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10975,7 +10555,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10984,39 +10564,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11024,35 +10604,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -11063,31 +10643,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11095,43 +10675,44 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -11139,16 +10720,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11166,14 +10747,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11181,7 +10764,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= BufferLoad: true BufferStore: true CUCount: null @@ -11204,14 +10787,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11220,37 +10804,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11272,17 +10856,17 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 MatrixInstK: 32 @@ -11301,20 +10885,20 @@ NonTemporal: -1 NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11331,22 +10915,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11354,10 +10938,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11367,7 +10951,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -11375,15 +10960,15 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 2 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -11397,19 +10982,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11417,7 +11004,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xR68Z4R7jyAOkgwfr2W7Y3XG7smz3NAVO_kD672DuYEg= BufferLoad: true BufferStore: true CUCount: null @@ -11437,17 +11024,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11456,37 +11044,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11507,15 +11095,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11536,21 +11124,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11558,7 +11146,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -11567,8 +11155,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11577,23 +11165,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11603,24 +11191,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11629,7 +11218,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11638,14 +11227,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11653,7 +11244,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= BufferLoad: true BufferStore: true CUCount: null @@ -11663,7 +11254,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11676,6 +11267,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11692,39 +11284,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28864 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 28864 - LdsNumElementsAlignedA: 4160 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4160 - LdsOffsetB_Blk: 20544 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4160 - LdsOffsetMetadata_Blk: 20544 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11732,35 +11324,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -11771,23 +11363,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 - NonTemporalE: 0 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 7 + NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -11803,8 +11395,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11819,16 +11411,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -11839,12 +11431,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11854,18 +11447,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11874,14 +11467,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11889,29 +11484,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11928,37 +11524,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 100352 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11966,10 +11562,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -11980,13 +11576,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveTile: [5, 2] + MIWaveTileA: 5 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 160 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 160 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12001,28 +11597,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12030,8 +11626,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12039,8 +11635,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12062,9 +11658,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 20 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 20 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12075,12 +11671,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -12092,16 +11689,16 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12110,14 +11707,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12125,20 +11724,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1R4sVsBWCeh56t4_hhONAiKYW1myOGPmVq0nXhjyJNZo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12148,6 +11747,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -12155,7 +11755,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12164,37 +11764,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 14336 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 14336 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12202,10 +11802,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -12216,14 +11816,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12237,28 +11837,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 6 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 5 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12275,8 +11875,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12286,7 +11886,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -12298,29 +11898,30 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -12328,16 +11929,16 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12346,14 +11947,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12361,20 +11964,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16DbJizEd-UmOez0mMSJd0N_uM7eLvQVu0Z44HBDIDLQQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16K1w2va1wKhvgZoTyL5zX8YLee7JqsF5V6wtSMY1I-GE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12384,7 +11987,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -12404,34 +12007,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12439,10 +12042,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -12452,14 +12055,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12474,28 +12077,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12504,7 +12107,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12512,39 +12115,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 4 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12564,35 +12167,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12600,7 +12204,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16hJIai0BInLZV7hgR5cDeaVnCphwevySseItKMzY2S58= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3HCPsg4TodlR6Q8cd0m5gwDltLOtlMg4yX28q5bkM3nc= BufferLoad: true BufferStore: true CUCount: null @@ -12610,7 +12214,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -12631,7 +12235,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12643,36 +12247,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 49408 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 49408 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8320 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12681,10 +12285,10 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -12692,23 +12296,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12720,21 +12324,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12751,39 +12355,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12793,28 +12397,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12832,6 +12436,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12839,7 +12444,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI1616MP9_S2iGj4HSyT5jYf0jksUCFwInC_pUDeGGq21_Y= BufferLoad: true BufferStore: true CUCount: null @@ -12864,7 +12469,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -12882,34 +12487,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12930,14 +12535,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 256 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12958,21 +12563,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -12990,39 +12595,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13042,9 +12647,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13071,6 +12676,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13078,7 +12684,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1H-2kp_jB-adLzEFBXJF42UhvA7Xsaxxj-pEwhy1dSyQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= BufferLoad: true BufferStore: true CUCount: null @@ -13088,7 +12694,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13109,7 +12715,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13121,34 +12727,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13158,10 +12764,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13169,15 +12775,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13197,23 +12803,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 + NonTemporalA: 5 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13221,7 +12827,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13229,39 +12835,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13274,23 +12880,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -13310,6 +12916,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13317,17 +12924,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3FEJ1XNoJafcDx9fHEvs_ccmeJEo5H77-RB7nM5avMyc= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13337,7 +12944,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -13348,7 +12955,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13360,36 +12967,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13397,35 +13004,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13436,30 +13043,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 10 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -13468,8 +13075,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -13478,23 +13085,23 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13510,26 +13117,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -13545,10 +13152,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13556,27 +13164,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xA0Nndy-Gm0_-9jnJPOikGUN_FqV0gpTqShRRDXkv1uQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -13599,34 +13207,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 89088 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 89088 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 89088 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13634,12 +13242,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13647,15 +13255,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 448 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13669,28 +13277,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13698,8 +13306,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13707,33 +13315,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13753,28 +13361,28 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -13782,12 +13390,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13795,12 +13404,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -13818,7 +13427,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -13826,7 +13434,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13835,10 +13443,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -13848,26 +13456,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 109056 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13875,35 +13483,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13916,20 +13524,20 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13938,7 +13546,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13946,33 +13554,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13982,8 +13590,7 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -13991,16 +13598,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14018,7 +13625,6 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -14027,6 +13633,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14034,7 +13641,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16z6K3Sb_rajFl7CvRhVdmX-td587lL_0kZQhA44LAkbg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= BufferLoad: true BufferStore: true CUCount: null @@ -14044,7 +13651,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14057,15 +13664,14 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14074,39 +13680,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 150528 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 150528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 75264 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 83968 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14115,34 +13721,34 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14153,22 +13759,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14185,32 +13791,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -14221,8 +13827,7 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -14230,7 +13835,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -14238,15 +13843,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -14257,15 +13862,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14273,7 +13878,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= BufferLoad: true BufferStore: true CUCount: null @@ -14296,7 +13901,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14304,7 +13908,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14313,10 +13917,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -14326,26 +13930,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 76288 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14353,35 +13957,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14392,22 +13996,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 2 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14416,7 +14020,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14424,44 +14028,43 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -14469,7 +14072,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -14477,8 +14080,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14491,12 +14094,11 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -14505,6 +14107,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14512,12 +14115,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -14535,15 +14138,14 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14552,39 +14154,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 129536 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 129536 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14592,35 +14194,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14631,22 +14233,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 2 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14655,7 +14257,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14663,44 +14265,43 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -14708,16 +14309,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14730,20 +14331,20 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14751,7 +14352,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xM9LgWSLPQURN0LQ8_cZ_x-ShHUsOQ4UKzbLGoJT6DOA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= BufferLoad: true BufferStore: true CUCount: null @@ -14761,10 +14362,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14774,7 +14375,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14782,7 +14382,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14791,50 +14391,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69120 + LdsBytesNoAmax: 115712 LdsInitCVgprs: false - LdsNumBytes: 69120 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 144896 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69120 - LdsOffsetMetadata_Blk: 144896 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14842,15 +14442,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14864,28 +14464,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 12 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14902,68 +14502,67 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -14974,15 +14573,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14990,7 +14589,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI160gzIbQL4-jpzPH-VghbnvNdTTqRF2iTl-Bj4jYdi7Kc= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= BufferLoad: true BufferStore: true CUCount: null @@ -15000,7 +14599,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15010,10 +14609,9 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15030,48 +14628,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 17408 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -15081,15 +14679,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15109,31 +14707,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 + NonTemporalA: 4 + NonTemporalB: 1 NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15141,68 +14739,67 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -15213,15 +14810,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15229,19 +14826,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xhzz0mUXjdKncv9yWH36CWKfQiJUxDnvM6oldFtHiGPQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -15252,10 +14849,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -15269,75 +14865,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 29504 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 29504 + LdsNumElementsAlignedA: 10560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 10560 + LdsOffsetB_Blk: 26944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 10560 + LdsOffsetMetadata_Blk: 26944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 80 + MacroTile1: 16 + MacroTileA: 80 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -15348,23 +14944,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15372,7 +14968,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15380,33 +14976,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 7 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15416,51 +15012,50 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15468,7 +15063,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xXqj08Ftg00Rv2SJi2FzFlO6u1HdA7oX4HR8160_tNf0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= BufferLoad: true BufferStore: true CUCount: null @@ -15478,7 +15073,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15491,7 +15086,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15508,39 +15102,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 - LdsInitCVgprs: false - LdsNumBytes: 51200 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -15549,10 +15143,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 - LoopUnroll: 64 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -15560,23 +15154,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -15587,22 +15181,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15619,33 +15213,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15654,14 +15248,13 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -15671,18 +15264,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15691,15 +15284,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15707,7 +15300,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x32_MI16x_7t2JVerTtHT1wEv8es6jrjUmRGlaHN5v8t1PAlVSK4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= BufferLoad: true BufferStore: true CUCount: null @@ -15717,7 +15310,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15738,7 +15331,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15750,34 +15343,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 39424 + LdsBytesNoAmax: 156672 LdsInitCVgprs: false - LdsNumBytes: 39424 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 70144 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 39424 - LdsOffsetMetadata_Blk: 70144 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -15787,8 +15380,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -15798,15 +15391,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 80 + MacroTileA: 64 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15826,22 +15419,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 8 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15858,33 +15451,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15893,7 +15486,7 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -15903,23 +15496,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -15935,10 +15528,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15946,7 +15540,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xj7U-zgwpd4zPGOHREedIwdwjVu8pyLWNmqSZsXIQfiQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= BufferLoad: true BufferStore: true CUCount: null @@ -15956,7 +15550,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15966,7 +15560,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -15977,7 +15571,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15989,36 +15583,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -16026,11 +15620,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -16038,23 +15632,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16065,22 +15659,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 + NonTemporalA: 2 + NonTemporalB: 5 NonTemporalC: 1 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16088,7 +15682,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -16097,33 +15691,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16132,35 +15726,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16174,10 +15768,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16185,20 +15780,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x256_MI16x9N4FL5Gr-S5lZKFQp99NVD859UtIa5XyMClMxXi3-_8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16228,45 +15823,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 99840 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 99840 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -16276,15 +15871,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16298,28 +15893,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 7 + NonTemporalB: 6 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16336,33 +15931,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16371,7 +15966,7 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -16382,22 +15977,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -16411,12 +16006,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16424,7 +16020,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xR68Z4R7jyAOkgwfr2W7Y3XG7smz3NAVO_kD672DuYEg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= BufferLoad: true BufferStore: true CUCount: null @@ -16434,7 +16030,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -16444,18 +16040,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16467,34 +16063,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -16504,8 +16100,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -16515,15 +16111,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16544,21 +16140,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16566,8 +16162,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16575,70 +16171,70 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16654,8 +16250,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16663,7 +16260,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xhRkmfbWaZLauRvxHEZRkWypKxx8tWVFg5-SELIV8PaU= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= BufferLoad: true BufferStore: true CUCount: null @@ -16673,7 +16270,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -16694,7 +16291,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16706,34 +16303,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27136 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -16743,10 +16340,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16754,15 +16351,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16782,22 +16379,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16814,13 +16411,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -16830,26 +16427,26 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -16859,23 +16456,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -16891,10 +16488,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16902,30 +16500,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xDXqX3nOI1ws82SuzK10Xb7Lyblmqqz7YODwj8fdf6Ec= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -16933,7 +16531,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16945,98 +16543,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 152064 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 84480 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 84480 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 512 + MacroTile1: 192 MacroTileA: 64 - MacroTileB: 512 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 3 NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17044,8 +16642,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17053,22 +16651,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -17077,9 +16675,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17088,7 +16686,7 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -17098,42 +16696,43 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17141,7 +16740,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x320x32_MI16xEDT-TS7XsVhCqRrDXPaaM9Dw-865cKdJ2bN2UQP64eM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= BufferLoad: true BufferStore: true CUCount: null @@ -17151,7 +16750,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -17161,7 +16760,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -17172,7 +16771,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17184,36 +16783,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -17221,35 +16820,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 320 + MacroTile1: 192 MacroTileA: 64 - MacroTileB: 320 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -17260,22 +16859,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17283,8 +16882,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17292,18 +16891,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -17316,9 +16915,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17327,35 +16926,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -17371,8 +16970,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17380,27 +16980,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI16x1ClaJAhs9bAnetPpejtwWnPhg54ltN5Lq3cuNtacQMlU= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -17423,7 +17023,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -17433,24 +17033,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17458,12 +17058,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17471,14 +17071,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -17493,36 +17093,38 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -17531,14 +17133,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -17547,17 +17149,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17571,21 +17173,24 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -17606,12 +17211,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17619,20 +17225,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1sKWcpwZgqDGbasNzEHbLdPIjLWTrCFpI8w8F6Ex2Yns= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1Y2_PBeJGjEXXPI1_8Q1nplFuMWjj1kVwTjC8QiIVYG0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17662,34 +17268,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17697,12 +17303,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17710,15 +17316,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17732,29 +17338,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17762,7 +17370,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17770,14 +17378,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -17786,23 +17394,23 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17810,9 +17418,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -17822,18 +17433,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -17845,12 +17456,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17858,7 +17470,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1EqYOzUyY9ygjHWYmTZonE5kaamHc6F8AcaxmECEx-PQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI161e0FqGLAMztP_dD6xxGdXRut9vwmQsZsotL-hvEenUg= BufferLoad: true BufferStore: true CUCount: null @@ -17868,7 +17480,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -17889,7 +17501,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17901,34 +17513,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 + LdsBytesNoAmax: 152064 LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 152064 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 42240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 + LdsOffsetA_Blk: 76032 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 109824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 109824 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17938,8 +17550,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -17950,14 +17562,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17977,23 +17589,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18001,7 +17615,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18009,22 +17623,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18032,10 +17646,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18049,12 +17663,15 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -18062,15 +17679,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -18086,10 +17703,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18097,7 +17715,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x8EUEagTvWzblDujb4a6-sHj4LC2-vadXjUhTArIYoxM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= BufferLoad: true BufferStore: true CUCount: null @@ -18107,10 +17725,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18128,7 +17746,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18140,34 +17758,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18175,10 +17793,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] @@ -18189,14 +17807,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18210,29 +17828,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18248,8 +17868,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -18258,12 +17878,12 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18271,16 +17891,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18288,28 +17908,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -18323,12 +17946,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18336,7 +17960,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI16x1EDz1q2jKSyQTb112LxnuUuMbJQuylVJAloo1XWaXWOY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xWU2YXZSq42Zyg4I1M1bqQl8lWMhuX8bgcxNNF-WMw3U= BufferLoad: true BufferStore: true CUCount: null @@ -18346,7 +17970,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -18367,7 +17991,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18379,39 +18003,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 115712 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -18427,15 +18051,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18455,23 +18079,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18487,32 +18113,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -18527,28 +18153,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -18566,8 +18195,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18575,7 +18205,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1UXoAKHnq0-WZoXPcpEIvd8tVnS0VOqSLDnuTlvOr2KI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Rdlq7Rc2vP_yhCpcNjpQdCrFtftCCMe2J1b1mDh_IUI= BufferLoad: true BufferStore: true CUCount: null @@ -18606,7 +18236,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18618,7 +18248,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -18631,23 +18261,23 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 98816 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 98816 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 82176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16640 + LdsOffsetMetadata_Blk: 82176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -18655,11 +18285,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -18667,9 +18297,9 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -18680,10 +18310,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -18694,16 +18324,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -18711,6 +18341,8 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18726,33 +18358,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18766,20 +18398,23 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -18789,7 +18424,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -18803,10 +18438,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18814,7 +18450,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= BufferLoad: true BufferStore: true CUCount: null @@ -18824,17 +18460,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -18845,7 +18481,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18857,34 +18493,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18892,12 +18528,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18906,14 +18542,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18927,36 +18563,38 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -18965,22 +18603,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18988,10 +18626,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19005,28 +18643,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -19040,12 +18681,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19053,7 +18695,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT8eZn8BuF1Nziw24iNKnDP44-Wc-OfwT54KqMWpiGeWHs= BufferLoad: true BufferStore: true CUCount: null @@ -19064,7 +18706,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -19075,15 +18717,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19093,37 +18735,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 + LdsBytesNoAmax: 82432 LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 82432 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 69632 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 143872 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 82432 + LdsOffsetMetadata_Blk: 143872 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -19136,7 +18778,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19144,15 +18786,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [5, 8] MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 64 - MacroTileA: 160 - MacroTileB: 64 + MacroTile0: 80 + MacroTile1: 512 + MacroTileA: 80 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19172,22 +18814,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 NumLoadsA: 5 - NumLoadsB: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19204,33 +18846,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 20 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 20 - ThreadTileB: 2 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19246,27 +18889,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -19281,10 +18924,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19292,7 +18936,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xd-sQFx5RImj-B93gHPT59pdW-PfpHkRagGWfDrPaTnY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= BufferLoad: true BufferStore: true CUCount: null @@ -19302,10 +18946,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19314,16 +18958,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19332,50 +18976,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 - LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19383,15 +19027,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19405,28 +19049,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19443,33 +19087,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19485,32 +19130,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -19518,12 +19163,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19531,7 +19177,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-95uwA7thjYZN8Q0bg1wWhpCED9_VDGCoINGlVsybm0= BufferLoad: true BufferStore: true CUCount: null @@ -19541,8 +19186,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -19553,16 +19198,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19571,42 +19216,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -19614,7 +19259,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19622,9 +19267,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 @@ -19650,22 +19295,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19682,32 +19327,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -19724,28 +19370,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -19759,10 +19405,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19770,18 +19417,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32NrolHpaoNMGubHMXg2QZmfmFTUHLarzXy0RPUp34mNo= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -19792,15 +19439,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -19810,75 +19457,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -19889,21 +19536,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -19921,33 +19568,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19963,36 +19611,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -20000,8 +19648,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20009,7 +19658,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1R4sVsBWCeh56t4_hhONAiKYW1myOGPmVq0nXhjyJNZo= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= BufferLoad: true BufferStore: true CUCount: null @@ -20019,8 +19668,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20031,16 +19680,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20049,42 +19698,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -20092,7 +19741,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20100,10 +19749,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 32 @@ -20128,22 +19777,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20152,7 +19801,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20160,39 +19809,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20205,24 +19855,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20237,10 +19887,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20248,7 +19899,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1HgIojdE0sEyvw7DzcXO7orbn8GPiaT6KOVUT7VcaD98= BufferLoad: true BufferStore: true CUCount: null @@ -20258,8 +19908,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20270,16 +19920,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20288,50 +19938,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20339,15 +19989,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20368,21 +20018,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20391,7 +20041,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20399,33 +20049,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20444,24 +20095,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20469,8 +20120,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -20478,8 +20129,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20487,7 +20139,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xRMVFeWautsW6BnxBD-9Gp9QjcP7DmKIwBt0NKkW_I0M= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= BufferLoad: true BufferStore: true CUCount: null @@ -20498,9 +20150,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20509,16 +20161,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20527,37 +20179,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -20565,12 +20217,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20578,15 +20230,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20600,29 +20252,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20638,33 +20290,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20680,27 +20333,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20713,12 +20366,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20726,18 +20380,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16K1w2va1wKhvgZoTyL5zX8YLee7JqsF5V6wtSMY1I-GE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20748,16 +20402,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20766,37 +20420,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -20806,10 +20460,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20817,15 +20471,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20845,23 +20499,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20869,7 +20523,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20877,39 +20531,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20919,45 +20574,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20965,20 +20621,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3HCPsg4TodlR6Q8cd0m5gwDltLOtlMg4yX28q5bkM3nc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20987,16 +20642,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21005,61 +20660,61 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8320 - LdsOffsetMetadata_Blk: 41088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -21070,36 +20725,36 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21108,7 +20763,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21116,39 +20771,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21158,27 +20814,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -21186,17 +20842,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21204,18 +20861,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16ZwWemFSm3p4igCb8DLsnt6R0foWhVdW4rDKTnbvNCrM= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21226,16 +20883,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21244,37 +20901,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21284,10 +20941,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21295,15 +20952,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21324,22 +20981,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21347,7 +21004,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21355,39 +21012,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21397,28 +21055,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21427,15 +21085,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21443,18 +21102,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x96x32_MI16xj2r8CbY8oh8svK2xXcWJE-9Rp3qvl1HSgXtzAYdPfBI= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21465,16 +21124,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21483,50 +21142,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 - LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 15360 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21534,15 +21193,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 96 - MacroTileA: 192 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21562,22 +21221,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21586,7 +21245,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21594,37 +21253,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -21636,45 +21296,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21682,7 +21343,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI1616MP9_S2iGj4HSyT5jYf0jksUCFwInC_pUDeGGq21_Y= BufferLoad: true BufferStore: true CUCount: null @@ -21693,7 +21353,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21704,15 +21364,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -21722,37 +21382,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21765,7 +21425,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21774,14 +21434,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21802,21 +21462,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21833,19 +21493,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -21855,17 +21515,18 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21875,7 +21536,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -21886,34 +21547,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21921,20 +21583,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI16xro-D8veBeNmE7sMrxwoD_VnMFoW5lt0JJ6_H2vXgEsE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9yCMUvmmmKthOJiKqIB_mYxsvQAAqv6o_A39DiLPiA4s= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -21943,16 +21605,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21961,37 +21623,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 48128 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21999,12 +21661,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22012,15 +21674,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22034,28 +21696,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22064,7 +21726,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22072,32 +21734,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 12 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 12 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -22114,45 +21777,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22160,7 +21824,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI1632V6Pxwe_725MnjTV9-vmbwgaM0uPWz-Wc0cWfNZEtw= BufferLoad: true BufferStore: true CUCount: null @@ -22171,7 +21834,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22182,15 +21845,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -22200,11 +21863,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -22243,7 +21906,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22281,12 +21944,12 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 @@ -22311,19 +21974,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -22333,17 +21996,18 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 ThreadTile1: 4 ThreadTileA: 16 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -22353,7 +22017,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -22364,16 +22028,16 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -22381,17 +22045,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22399,7 +22064,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16q_1WHADry2cTxAoNmb44qR0nw6Q5GfHEK9JlilAlfio= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= BufferLoad: true BufferStore: true CUCount: null @@ -22410,7 +22075,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22421,15 +22086,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -22439,37 +22104,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 8 + LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 13312 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -22482,7 +22147,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22490,15 +22155,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22519,22 +22184,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22550,33 +22215,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22592,36 +22258,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22629,8 +22295,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22638,7 +22305,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32EK4Ew7js7-4eeoqDkT9c309xo8_wSmrv-mO16L7kjzE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= BufferLoad: true BufferStore: true CUCount: null @@ -22649,7 +22316,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22658,18 +22325,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22678,11 +22345,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -22691,8 +22358,8 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 34816 LdsInitCVgprs: false @@ -22709,8 +22376,8 @@ LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 34816 LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -22718,11 +22385,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -22730,9 +22397,9 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 128 @@ -22743,10 +22410,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -22757,16 +22424,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -22780,8 +22447,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22789,37 +22456,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -22831,27 +22499,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -22868,8 +22536,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22877,7 +22546,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= BufferLoad: true BufferStore: true CUCount: null @@ -22887,8 +22556,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22899,16 +22568,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22917,42 +22586,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -22960,7 +22629,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22969,13 +22638,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22996,23 +22665,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23028,19 +22697,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -23050,10 +22719,11 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -23070,27 +22740,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23098,8 +22768,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23107,8 +22777,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23116,18 +22787,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23136,17 +22807,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -23156,50 +22827,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 67072 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23207,15 +22878,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23235,30 +22906,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -23267,39 +22938,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -23313,23 +22985,23 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23346,8 +23018,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23355,18 +23028,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1b-6QnEjVmiUVwlO7nIU1i3ohIg0WxzZTpmj7_cnGZsM= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23375,18 +23048,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23395,37 +23068,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB7_NTC6_NTD6_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -23435,10 +23108,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23446,15 +23119,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23475,20 +23148,20 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -23497,8 +23170,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23506,37 +23179,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB7_NTC6_NTD6_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -23551,24 +23225,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23576,8 +23250,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23585,8 +23259,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23594,7 +23269,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1P_TUSJ5Myg8d1FJPwt0Wuh6SkhZpVj2Mg6_2M6eLfpQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= BufferLoad: true BufferStore: true CUCount: null @@ -23604,8 +23279,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23614,18 +23289,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23634,42 +23309,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA5_NTB2_NTC5_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -23677,7 +23352,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23685,15 +23360,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23713,22 +23388,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23736,7 +23411,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -23745,33 +23420,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA5_NTB2_NTC5_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23787,27 +23463,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23815,17 +23491,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23833,7 +23510,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xouCaO_kS3h9va9qBGG2ChOaMcrsz9moeqbtRNLm2voU= BufferLoad: true BufferStore: true CUCount: null @@ -23843,10 +23519,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -23855,15 +23531,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -23873,50 +23549,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB5_NTC3_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152576 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 152576 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76288 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 84992 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 84992 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23924,15 +23600,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 8] + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23946,28 +23622,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23976,7 +23652,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23984,39 +23660,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB5_NTC3_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24026,7 +23703,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -24036,35 +23713,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24072,7 +23750,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x48x32_MI16x77c5vCqhF37wngAzGqfnFLKmA0iPzLeWaJiRYnuXPkU= BufferLoad: true BufferStore: true CUCount: null @@ -24082,8 +23759,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24094,16 +23771,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24112,42 +23789,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -24155,7 +23832,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24163,15 +23840,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 48 - MacroTileA: 160 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24192,22 +23869,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 10 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 3 - NumThreads: 128 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24223,33 +23900,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 3 - ThreadTileA: 20 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24265,32 +23943,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -24302,8 +23980,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24311,18 +23990,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32Q9GNPZBj4w2q8J-ZJZlKjlie8yP-WUZUJeJmisgU0-g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24333,15 +24012,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -24351,11 +24030,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -24365,25 +24044,25 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 72704 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 72704 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 72704 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -24391,11 +24070,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -24403,23 +24082,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 384 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -24430,22 +24109,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24454,7 +24133,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24462,39 +24141,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24504,27 +24184,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -24532,8 +24212,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24541,8 +24221,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24550,7 +24231,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= BufferLoad: true BufferStore: true CUCount: null @@ -24561,7 +24242,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24570,18 +24251,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24590,11 +24271,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -24603,24 +24284,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 89088 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 89088 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 71680 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 148480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 89088 - LdsOffsetMetadata_Blk: 148480 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -24633,7 +24314,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24641,15 +24322,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 448 - MacroTileA: 128 - MacroTileB: 448 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24670,21 +24351,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24692,7 +24373,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -24701,39 +24382,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24746,24 +24428,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -24780,8 +24462,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24789,18 +24472,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32WmWOeC1HZXF52s2qw5enQVIcphzfzkpjTKp5gB44mAw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24811,16 +24494,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24829,11 +24512,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -24842,26 +24525,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -24869,35 +24552,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -24908,22 +24591,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24932,7 +24615,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24940,19 +24623,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24962,11 +24645,12 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24982,27 +24666,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 48 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -25017,10 +24701,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25028,18 +24713,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25050,15 +24734,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25067,39 +24752,520 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 109056 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 109056 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHDQbHdXWrhDPz3uAOriInqMw0_ypUmAB2yHcbrln9g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25108,34 +25274,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25147,20 +25313,20 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 + NumElementsPerBatchStore: 8 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -25179,32 +25345,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 105 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25214,33 +25381,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -25249,14 +25417,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25264,7 +25434,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= BufferLoad: true BufferStore: true CUCount: null @@ -25274,10 +25444,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -25286,15 +25456,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25303,101 +25474,101 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 150528 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 150528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 66560 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 75264 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 83968 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 16 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25415,31 +25586,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 106 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 20 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 20 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -25450,15 +25622,16 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -25466,33 +25639,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25500,18 +25675,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25522,15 +25696,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25539,39 +25714,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 90624 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25580,10 +25755,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -25591,23 +25766,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveTile: [5, 4] + MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25618,21 +25793,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 2 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -25651,31 +25826,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 107 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 20 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 20 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -25686,49 +25862,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25736,18 +25915,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1cxVKEoGpectXJ4hizehb-leeaygHA2aT8hzudE-aBUA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25758,15 +25937,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25775,11 +25955,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -25788,26 +25968,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129536 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 129536 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25815,35 +25995,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 384 - MacroTileA: 64 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25855,21 +26035,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 2 - NumLoadsB: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25878,7 +26058,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -25887,50 +26067,52 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 108 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -25938,33 +26120,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25972,20 +26156,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-XbEyQ5BisbWWMjTpMs98hyJ-dhnlHSIfClgvFcUEQk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -25994,14 +26178,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -26011,50 +26196,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26062,15 +26247,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26084,27 +26269,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -26114,7 +26299,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26123,42 +26308,44 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 109 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -26167,23 +26354,23 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -26193,14 +26380,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26208,7 +26397,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= BufferLoad: true BufferStore: true CUCount: null @@ -26218,8 +26407,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26230,15 +26419,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26247,50 +26437,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26298,15 +26488,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26326,23 +26516,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26350,7 +26540,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26359,67 +26549,69 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 110 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -26429,14 +26621,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26444,19 +26638,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -26466,15 +26660,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26483,37 +26678,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29504 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 29504 - LdsNumElementsAlignedA: 10560 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10560 - LdsOffsetB_Blk: 26944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10560 - LdsOffsetMetadata_Blk: 26944 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26521,12 +26716,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26534,15 +26729,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 16 - MacroTileA: 80 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26562,23 +26757,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 10 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26595,84 +26790,88 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 111 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 1 - ThreadTileA: 20 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26680,7 +26879,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= BufferLoad: true BufferStore: true CUCount: null @@ -26690,8 +26889,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26702,14 +26901,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -26719,37 +26919,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 44544 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26759,10 +26959,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26770,15 +26970,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26798,22 +26998,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26822,7 +27022,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26831,31 +27031,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 112 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 24 ThreadTile1: 3 - ThreadTileA: 8 + ThreadTileA: 24 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true @@ -26865,13 +27066,14 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -26881,34 +27083,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26916,7 +27120,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= BufferLoad: true BufferStore: true CUCount: null @@ -26926,8 +27130,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26938,16 +27142,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26956,37 +27160,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26996,10 +27200,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27007,15 +27211,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27036,21 +27240,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 20 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27059,7 +27263,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27068,31 +27272,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 113 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 5 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true @@ -27102,35 +27307,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -27148,6 +27353,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27155,18 +27361,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27177,15 +27383,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -27194,38 +27400,38 @@ ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27235,10 +27441,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27246,15 +27452,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27274,22 +27480,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27307,32 +27513,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 114 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 5 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27341,14 +27548,14 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27358,26 +27565,26 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27387,6 +27594,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27394,18 +27602,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27416,16 +27623,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27434,37 +27641,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27474,10 +27681,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27486,14 +27693,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27513,22 +27720,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27546,18 +27753,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 115 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -27567,11 +27774,12 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27580,43 +27788,43 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27626,6 +27834,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27633,7 +27842,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT31zhAEgAY7p8a5We0BzsiowfNKry6VBUHEQOZboSmQsk= BufferLoad: true BufferStore: true CUCount: null @@ -27644,7 +27853,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27655,15 +27864,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -27673,11 +27882,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 128 LSCB: 128 LSPA: 8 @@ -27689,34 +27898,34 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27724,15 +27933,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 3] MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27753,21 +27962,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27776,7 +27985,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27785,48 +27994,49 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 116 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 ThreadTile1: 3 ThreadTileA: 8 ThreadTileB: 3 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27836,18 +28046,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -27865,6 +28075,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27872,7 +28083,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= BufferLoad: true BufferStore: true CUCount: null @@ -27882,8 +28092,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27894,16 +28104,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27912,50 +28122,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 81920 LdsInitCVgprs: false - LdsNumBytes: 86016 + LdsNumBytes: 81920 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27963,15 +28173,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27992,21 +28202,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28015,7 +28225,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28024,32 +28234,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 117 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28058,43 +28269,43 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -28104,6 +28315,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28111,7 +28323,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= BufferLoad: true BufferStore: true CUCount: null @@ -28121,8 +28332,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28131,18 +28342,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28151,39 +28362,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -28191,35 +28402,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -28230,22 +28441,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 24 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28253,8 +28464,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28263,31 +28474,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 118 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 3 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true @@ -28297,17 +28509,17 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -28315,17 +28527,17 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -28343,6 +28555,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28350,7 +28563,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null @@ -28360,8 +28573,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28370,17 +28583,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -28390,39 +28603,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -28430,35 +28643,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -28470,21 +28683,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 24 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28492,8 +28705,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28502,18 +28715,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 119 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -28523,11 +28736,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28536,35 +28750,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -28578,10 +28792,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28589,7 +28804,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= BufferLoad: true BufferStore: true CUCount: null @@ -28599,8 +28814,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28611,16 +28826,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28629,37 +28844,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 59904 LdsInitCVgprs: false LdsNumBytes: 59904 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28669,10 +28884,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28680,15 +28895,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 320 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28710,23 +28925,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28734,7 +28947,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28743,38 +28956,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 120 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -28782,32 +28996,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -28815,8 +29026,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -28826,6 +29037,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28833,7 +29045,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1oxplz8DgKAzzHI9atKMOCqO_fHvi4aY6eI_59fmy5x0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= BufferLoad: true BufferStore: true CUCount: null @@ -28843,8 +29055,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28855,16 +29067,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28873,19 +29085,19 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 @@ -28913,10 +29125,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28925,14 +29137,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28955,13 +29167,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -28969,8 +29181,6 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28978,7 +29188,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28987,32 +29197,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 121 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29026,32 +29237,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29070,6 +29278,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29077,7 +29286,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1Y2_PBeJGjEXXPI1_8Q1nplFuMWjj1kVwTjC8QiIVYG0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= BufferLoad: true BufferStore: true CUCount: null @@ -29086,9 +29295,9 @@ CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -29099,16 +29308,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29117,37 +29326,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29157,10 +29366,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29168,15 +29377,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29203,18 +29412,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29222,7 +29429,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29231,36 +29438,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 122 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -29270,32 +29478,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29310,10 +29515,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29321,20 +29527,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x64_MI16x6H73GL1jU92BhYVu1-N3hGNc1zsprzx_b60KuCSUZ8M= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -29343,15 +29549,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -29361,37 +29567,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29399,12 +29605,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29412,15 +29618,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29434,31 +29640,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 6 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29466,7 +29670,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29475,38 +29679,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 123 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29514,50 +29719,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29565,7 +29768,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI161e0FqGLAMztP_dD6xxGdXRut9vwmQsZsotL-hvEenUg= BufferLoad: true BufferStore: true CUCount: null @@ -29576,9 +29778,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -29587,15 +29789,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -29605,11 +29807,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -29621,21 +29823,21 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 152064 + LdsNumBytes: 67584 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33792 - LdsOffsetB_Blk: 109824 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 109824 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29643,12 +29845,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29657,14 +29859,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 160 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29678,31 +29880,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29719,38 +29919,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 124 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29758,33 +29959,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -29796,12 +29994,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29809,18 +30008,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x192x32_MI328ZzXv-gq9j9I9S9bJzI6udNnruDK5iaoddi5jgQlSzk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -29831,16 +30030,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29849,11 +30048,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -29862,26 +30061,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -29889,35 +30088,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -29930,23 +30129,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 10 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29954,7 +30151,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29963,18 +30160,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 125 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29984,15 +30181,16 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 3 - ThreadTileA: 80 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30002,32 +30200,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -30046,6 +30241,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30053,18 +30249,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x64_MI16HbQjL6Xiy4_bwYKGnfEzEBO73K7v6GCjIcyS997kha4= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -30075,16 +30271,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30093,11 +30289,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -30106,24 +30302,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 79872 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -30136,7 +30332,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -30145,13 +30341,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -30179,18 +30375,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30207,31 +30401,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 126 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -30246,15 +30441,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -30262,16 +30454,16 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -30286,10 +30478,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30297,20 +30490,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3vs1Q68Ny_keXqIMp2Wsj6oT9pYijob5fQFPcck7RhE0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -30319,16 +30511,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30337,104 +30529,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 94720 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30442,7 +30632,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30451,18 +30641,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 127 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -30472,11 +30662,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30490,33 +30681,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -30528,12 +30716,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30541,20 +30730,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3DYvE-qV_7GGecMY1_JFL64PBZq2Vbkv4V5zJot8g6OQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -30563,16 +30751,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30581,104 +30769,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30686,7 +30872,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30695,32 +30881,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 128 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30734,50 +30921,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30785,18 +30970,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1OBoPYQkKxqQdcTYaOp3l8VgmoDSmZ_j5vyKbG-QUUJg= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -30807,15 +30992,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -30825,37 +31010,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -30865,10 +31050,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -30876,14 +31061,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -30906,23 +31091,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30930,7 +31113,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30939,38 +31122,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 129 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30978,32 +31162,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -31011,17 +31192,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31029,20 +31211,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3EUpHQs_hYzAB34YWQur8CNtLVfZKv6w_F6Q0El8csoA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -31051,16 +31232,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31069,104 +31250,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31174,7 +31353,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31183,18 +31362,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 130 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -31204,15 +31383,16 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31222,50 +31402,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31273,18 +31451,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x64_MI16x9ICG4gPhzi9OmT20CRMj-3n344OmlrV6OXzawY3nTvU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -31295,16 +31473,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31313,37 +31491,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -31353,10 +31531,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31365,13 +31543,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 96 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -31393,24 +31571,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31418,7 +31594,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31427,36 +31603,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 131 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 3 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31466,15 +31643,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -31482,21 +31656,21 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -31510,6 +31684,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31517,18 +31692,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -31537,18 +31712,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31557,50 +31732,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31608,15 +31783,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31638,31 +31813,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31671,36 +31844,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 132 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31710,32 +31884,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -31743,17 +31914,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31761,20 +31933,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI16MWaFtORgD4eC8ES5PuaeS70bQdnSFwME3aBH6fKhp6A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -31783,15 +31954,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -31801,37 +31972,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 100864 LdsInitCVgprs: false - LdsNumBytes: 152064 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 92160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 109824 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 109824 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -31839,12 +32010,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31852,15 +32023,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 9] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 576 + MacroTileA: 64 + MacroTileB: 576 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31874,31 +32045,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 2 + NumLoadsB: 18 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 18 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31906,7 +32075,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31915,38 +32084,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 133 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 9 ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTileB: 9 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -31954,12 +32124,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -31969,35 +32136,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32005,7 +32173,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xuZLNd47w3L6zFk9Cg2oSluavIaHhVBvshWxewFaNb8o= BufferLoad: true BufferStore: true CUCount: null @@ -32015,8 +32182,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -32025,18 +32192,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32045,75 +32212,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32126,31 +32293,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32159,32 +32324,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 134 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32198,32 +32364,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32231,17 +32394,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32249,20 +32413,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xWU2YXZSq42Zyg4I1M1bqQl8lWMhuX8bgcxNNF-WMw3U= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -32271,16 +32435,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32289,37 +32453,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 115712 + LdsNumBytes: 66560 LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -32327,12 +32491,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -32341,14 +32505,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32362,31 +32526,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32394,7 +32556,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32403,18 +32565,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 135 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -32424,11 +32586,12 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32442,32 +32605,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32480,12 +32640,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32493,18 +32654,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI32xPv1b-quWnWZzrEFikvQdugHi9IL2VdsODF63jr62UQ4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -32513,18 +32673,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32533,39 +32693,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 123392 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -32573,11 +32733,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -32585,23 +32745,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 160 MacroTile1: 256 - MacroTileA: 32 + MacroTileA: 160 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32613,32 +32773,30 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32647,38 +32805,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 136 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -32686,32 +32845,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32719,17 +32875,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32737,20 +32894,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Rdlq7Rc2vP_yhCpcNjpQdCrFtftCCMe2J1b1mDh_IUI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -32759,16 +32915,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32777,104 +32933,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 320 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32882,7 +33036,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32891,18 +33045,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 137 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -32912,11 +33066,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 5 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32930,33 +33085,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -32968,12 +33120,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32981,18 +33134,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3CZS8qX8a5jcy85YI3uVSITdEfZd1y3IMs4eLTwn3ELQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -33003,16 +33156,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33021,39 +33174,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -33061,35 +33214,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -33101,24 +33254,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33126,7 +33277,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -33135,18 +33286,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 138 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -33156,11 +33307,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33174,32 +33326,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -33218,6 +33367,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33225,7 +33375,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= BufferLoad: true BufferStore: true CUCount: null @@ -33236,7 +33386,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -33245,17 +33395,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -33265,11 +33415,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -33279,23 +33429,23 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -33308,7 +33458,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -33316,14 +33466,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -33346,30 +33496,28 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -33379,32 +33527,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 139 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33418,32 +33567,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -33460,285 +33606,297 @@ tailLoopOptB: true - [2, 3, 0, 1] - - - [233, 128, 1024, 32] - - [104, 0.0] + - [90, 17.77] - - [512, 8192, 1, 3072] - [0, 0.0] - - [512, 8192, 1, 3960] - - [60, 0.0] + - [44, 0.0] - - [512, 8192, 1, 5640] - - [61, 0.0] + - [45, 0.0] - - [528, 8192, 1, 256] - - [62, 0.0] + - [106, 57.71] - - [528, 8192, 1, 512] - - [1, 0.0] + - [107, 74.32] - - [1024, 8192, 1, 1980] - - [2, 0.0] + - [1, 0.0] - - [1024, 8192, 1, 3840] - - [3, 0.0] + - [2, 0.0] - - [2440, 8192, 1, 128] - - [63, 0.0] + - [93, 59.84] - - [5640, 8192, 1, 128] - - [4, 0.0] + - [108, 66.59] - - [61, 128, 8192, 40] - - [64, 0.0] + - [109, 14.89] - - [128, 30, 8192, 4] - - [5, 0.0] + - [3, 0.0] - - [128, 33, 8192, 16] - - [134, 14661.8] + - [110, 8.65] - - [128, 61, 8192, 40] - - [65, 0.0] + - [111, 18.88] - - [41, 17711, 1, 128] - - [121, 22531.0] + - [116, 10.94] - - [96, 17711, 1, 768] - - [6, 0.0] + - [4, 0.0] - - [256, 17711, 1, 887] - - [124, 166959.0] + - [73, 166959.0] - - [384, 17711, 1, 2732] - - [7, 0.0] + - [5, 0.0] - - [960, 17711, 1, 128] - - [90, 0.0] + - [112, 55.24] - - [2480, 17711, 1, 128] - - [125, 107650.0] + - [113, 63.59] - - [48, 124, 17711, 20] - - [8, 0.0] + - [6, 0.0] - - [128, 17711, 6, 128] - - [103, 0.0] + - [55, 0.0] - - [10, 655360, 1, 160] - - [9, 0.0] + - [7, 0.0] - - [28, 4096, 1, 256] - - [135, 11097.5] + - [75, 11097.5] - - [32, 262144, 1, 57] - - [136, 34735.6] + - [122, 17.74] - - [32, 262144, 1, 60] - - [66, 0.0] + - [121, 17.58] - - [32, 262144, 1, 82] - - [10, 0.0] + - [8, 0.0] - - [32, 262144, 1, 84] - - [11, 0.0] + - [9, 0.0] - - [48, 655360, 1, 192] - - [12, 0.0] + - [10, 0.0] - - [57, 4096, 1, 2048] - - [68, 0.0] + - [132, 27.48] - - [64, 4096, 1, 2048] - - [13, 0.0] + - [132, 29.94] - - [64, 102400, 1, 64] - - [69, 0.0] + - [46, 0.0] - - [64, 131072, 1, 128] - - [14, 0.0] + - [133, 40.14] - - [64, 527553, 1, 224] - - [15, 0.0] + - [138, 54.03] - - [64, 752863, 1, 224] - - [71, 0.0] + - [137, 55.19] - - [64, 806154, 1, 288] - - [72, 0.0] + - [125, 57.26] - - [72, 4096, 1, 256] - - [16, 0.0] + - [11, 0.0] - - [82, 4096, 1, 2048] - - [17, 0.0] + - [12, 0.0] - - [112, 655360, 1, 192] - - [18, 0.0] + - [13, 0.0] - - [116, 4096, 1, 256] - - [19, 0.0] + - [14, 0.0] - - [128, 4096, 1, 1600] - - [20, 0.0] + - [135, 40.1] - - [128, 131072, 1, 64] - - [21, 0.0] + - [15, 0.0] - - [160, 4096, 1, 512] - - [22, 0.0] + - [134, 28.98] - - [160, 4096, 1, 2048] - - [80, 0.0] + - [48, 0.0] - - [180, 4096, 1, 256] - - [23, 0.0] + - [16, 0.0] - - [256, 4096, 1, 28] - - [24, 0.0] + - [17, 0.0] - - [256, 4096, 1, 72] - - [25, 0.0] + - [18, 0.0] - - [256, 4096, 1, 116] - - [26, 0.0] + - [19, 0.0] - - [256, 4096, 1, 256] - - [137, 63493.6] + - [76, 63493.6] - - [256, 4096, 1, 4132] - - [83, 0.0] + - [128, 73.06] - - [256, 4096, 1, 7680] - - [27, 0.0] + - [20, 0.0] - - [304, 655360, 1, 644] - - [28, 0.0] + - [127, 94.46] - - [320, 4096, 1, 116] - - [29, 0.0] + - [21, 0.0] - - [320, 4096, 1, 180] - - [30, 0.0] + - [22, 0.0] - - [512, 4096, 1, 96] - - [31, 0.0] + - [23, 0.0] - - [512, 4096, 1, 160] - - [32, 0.0] + - [24, 0.0] - - [512, 4096, 1, 512] - - [33, 0.0] + - [25, 0.0] - - [512, 4096, 1, 2246] - - [34, 0.0] + - [26, 0.0] - - [512, 4096, 1, 4132] - - [89, 0.0] + - [126, 93.58] - - [512, 4096, 1, 7680] - - [35, 0.0] + - [124, 105.67] - - [2048, 4096, 1, 128] - - [93, 0.0] + - [108, 53.21] - - [2048, 4096, 1, 2048] - - [36, 0.0] + - [27, 0.0] - - [2048, 4096, 1, 2624] - - [37, 0.0] + - [28, 0.0] - - [2246, 4096, 1, 512] - - [38, 0.0] + - [29, 0.0] - - [2560, 4096, 1, 4096] - - [94, 0.0] + - [136, 133.67] - - [25, 25, 8192, 32] - - [96, 0.0] + - [53, 0.0] - - [32, 25, 8192, 25] - - [97, 0.0] + - [54, 0.0] - - [64, 57, 4096, 32] - - [98, 0.0] + - [102, 15.56] - - [64, 82, 4096, 32] - - [99, 0.0] + - [131, 18.45] - - [160, 642, 4096, 48] - - [39, 0.0] + - [30, 0.0] - - [200, 32, 4096, 64] - - [40, 0.0] + - [129, 18.97] - - [642, 160, 4096, 48] - - [41, 0.0] + - [31, 0.0] - - [128, 2048, 1, 256] - - [42, 0.0] + - [32, 0.0] - - [128, 2048, 1, 1024] - - [79, 0.0] + - [47, 0.0] - - [256, 2048, 1, 32] - - [43, 0.0] + - [33, 0.0] - - [256, 2048, 1, 36] - - [44, 0.0] + - [34, 0.0] - - [256, 2048, 1, 40] - - [45, 0.0] + - [35, 0.0] - - [256, 2048, 1, 48] - - [46, 0.0] + - [36, 0.0] - - [256, 2048, 1, 64] - - [120, 13745.7] + - [71, 13745.7] - - [256, 2048, 1, 72] - - [47, 0.0] + - [37, 0.0] - - [256, 2048, 1, 80] - - [48, 0.0] + - [38, 0.0] - - [256, 2048, 1, 96] - - [49, 0.0] + - [39, 0.0] - - [256, 2048, 1, 128] - - [50, 0.0] + - [40, 0.0] - - [256, 2048, 1, 256] - - [51, 0.0] + - [41, 0.0] - - [512, 2048, 1, 14336] - - [87, 0.0] + - [50, 0.0] - - [120, 8192, 1, 256] - - [127, 56756.2] + - [94, 24.98] - - [128, 8192, 1, 512] - - [130, 93560.3] + - [97, 41.14] - - [128, 8192, 1, 4352] - - [52, 0.0] + - [98, 73.84] - - [128, 8192, 1, 5120] - - [53, 0.0] + - [99, 76.51] - - [128, 8192, 1, 7296] - - [54, 0.0] + - [42, 0.0] - - [128, 98304, 1, 256] - - [131, 129793.0] + - [100, 73.12] - - [256, 8192, 1, 120] - - [55, 0.0] + - [101, 27.41] - - [256, 8192, 1, 128] - - [56, 0.0] + - [102, 32.62] - - [256, 8192, 1, 512] - - [57, 0.0] + - [103, 58.9] - - [256, 8192, 1, 4352] - - [58, 0.0] + - [43, 0.0] - - [512, 8192, 1, 1024] - - [133, 200366.0] + - [104, 90.42] - - [512, 8192, 1, 2048] - - [59, 0.0] + - [105, 102.4] - - [56, 131072, 1, 233] - - [67, 0.0] + - [78, 31.76] - - [64, 131072, 1, 64] - - [70, 0.0] + - [123, 36.24] - - [128, 1024, 1, 64] - - [73, 0.0] + - [84, 1.72] - - [128, 1024, 1, 72] - - [74, 0.0] + - [87, 1.7] - - [128, 1024, 1, 96] - - [75, 0.0] + - [92, 2.37] - - [128, 1024, 1, 128] - - [76, 0.0] + - [79, 3.08] - - [128, 1024, 1, 144] - - [77, 0.0] + - [85, 2.98] - - [128, 1024, 1, 4096] - - [78, 0.0] + - [82, 31.71] - - [128, 17711, 1, 128] - - [122, 58658.9] + - [72, 58658.9] - - [256, 1024, 1, 7968] - - [81, 0.0] + - [86, 58.1] - - [256, 4096, 1, 180] - - [82, 0.0] + - [101, 20.26] - - [320, 4096, 1, 28] - - [84, 0.0] + - [49, 0.0] - - [320, 4096, 1, 72] - - [85, 0.0] + - [131, 11.57] - - [512, 1024, 1, 2011] - - [86, 0.0] + - [83, 46.44] - - [512, 4096, 1, 80] - - [88, 0.0] + - [51, 0.0] - - [1024, 2048, 1, 14336] - - [91, 0.0] + - [52, 0.0] - - [2011, 1024, 1, 512] - - [92, 0.0] + - [80, 58.83] - - [7456, 1024, 1, 128] - - [95, 0.0] + - [89, 58.56] - - [64, 4096, 96, 160] - - [100, 0.0] + - [130, 45.87] - - [124, 48, 17711, 20] - - [101, 0.0] + - [118, 10.87] - - [128, 233, 1024, 32] - - [102, 0.0] + - [93, 25.13] - - [64, 9419, 1, 5120] - - [114, 0.0] + - [65, 0.0] - - [64, 9420, 1, 5120] - - [105, 0.0] + - [56, 0.0] - - [64, 18389, 1, 5120] - - [106, 0.0] + - [57, 0.0] - - [64, 18392, 1, 5120] - - [107, 0.0] + - [58, 0.0] - - [64, 21090, 1, 5120] - - [117, 0.0] + - [68, 0.0] - - [64, 21092, 1, 5120] - - [108, 0.0] + - [59, 0.0] - - [5120, 1, 1, 256] - - [109, 0.0] + - [60, 0.0] - - [5120, 1, 1, 5120] - - [110, 0.0] + - [61, 0.0] - - [30720, 1, 1, 5120] - - [111, 0.0] + - [62, 0.0] - - [64, 4106, 1, 5120] - - [112, 0.0] + - [63, 0.0] - - [64, 4200, 1, 5120] - - [113, 0.0] + - [64, 0.0] - - [64, 9450, 1, 5120] - - [115, 0.0] + - [66, 0.0] - - [64, 9452, 1, 5120] - - [116, 0.0] + - [67, 0.0] - - [64, 21263, 1, 5120] - - [118, 0.0] + - [69, 0.0] - - [64, 21264, 1, 5120] - - [119, 0.0] + - [70, 0.0] - - [128, 17711, 1, 928] - - [123, 144111.0] + - [117, 60.29] - - [17711, 246, 1, 384] - - [126, 123978.0] + - [115, 63.33] - - [120, 8192, 1, 512] - - [128, 83879.6] + - [95, 36.23] - - [128, 8192, 1, 64] - - [129, 22804.1] + - [96, 10.73] - - [512, 8192, 1, 256] - - [132, 128855.0] + - [74, 128855.0] - - [512, 4096, 1, 64] - - [138, 37015.4] + - [139, 18.43] - - [4096, 1024, 1, 128] - - [139, 87046.7] + - [77, 87046.7] + - - [128, 1024, 1, 512] + - [81, 9.27] + - - [128, 1024, 1, 256] + - [88, 5.58] + - - [7968, 1024, 1, 256] + - [91, 77.41] + - - [128, 17711, 1, 256] + - [114, 41.1] + - - [41, 128, 17711, 6] + - [119, 3.24] + - - [64, 819200, 1, 64] + - [120, 34.01] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..bbe51f2e23c --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml @@ -0,0 +1,15142 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9yCMUvmmmKthOJiKqIB_mYxsvQAAqv6o_A39DiLPiA4s= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 15360 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 15360 + LdsOffsetB_Blk: 48128 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 48128 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT8eZn8BuF1Nziw24iNKnDP44-Wc-OfwT54KqMWpiGeWHs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 82432 + LdsInitCVgprs: false + LdsNumBytes: 82432 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 69632 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 82432 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 512 + MacroTileA: 80 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99840 + LdsInitCVgprs: false + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72704 + LdsInitCVgprs: false + LdsNumBytes: 72704 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72704 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHDQbHdXWrhDPz3uAOriInqMw0_ypUmAB2yHcbrln9g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1cxVKEoGpectXJ4hizehb-leeaygHA2aT8hzudE-aBUA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-XbEyQ5BisbWWMjTpMs98hyJ-dhnlHSIfClgvFcUEQk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT31zhAEgAY7p8a5We0BzsiowfNKry6VBUHEQOZboSmQsk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 94720 + LdsInitCVgprs: false + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100864 + LdsInitCVgprs: false + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 92160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 576 + MacroTileA: 64 + MacroTileB: 576 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 2 + NumLoadsB: 18 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 18 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123392 + LdsInitCVgprs: false + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 + LdsInitCVgprs: false + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true +- [2, 3, 0, 1] +- - - [233, 128, 1024, 32] + - [0, 17.77] + - - [56, 131072, 1, 233] + - [1, 31.76] + - - [128, 1024, 1, 128] + - [2, 3.08] + - - [2011, 1024, 1, 512] + - [3, 58.83] + - - [128, 1024, 1, 512] + - [4, 9.27] + - - [128, 1024, 1, 4096] + - [5, 31.71] + - - [512, 1024, 1, 2011] + - [6, 46.44] + - - [128, 1024, 1, 64] + - [7, 1.72] + - - [128, 1024, 1, 144] + - [8, 2.98] + - - [256, 1024, 1, 7968] + - [9, 58.1] + - - [128, 1024, 1, 72] + - [10, 1.7] + - - [128, 1024, 1, 256] + - [11, 5.58] + - - [7456, 1024, 1, 128] + - [12, 58.56] + - - [7968, 1024, 1, 256] + - [13, 77.41] + - - [128, 1024, 1, 96] + - [14, 2.37] + - - [128, 233, 1024, 32] + - [15, 25.13] + - - [120, 8192, 1, 256] + - [16, 24.98] + - - [120, 8192, 1, 512] + - [17, 36.23] + - - [128, 8192, 1, 64] + - [18, 10.73] + - - [128, 8192, 1, 512] + - [19, 41.14] + - - [128, 8192, 1, 4352] + - [20, 73.84] + - - [128, 8192, 1, 5120] + - [21, 76.51] + - - [128, 98304, 1, 256] + - [22, 73.12] + - - [256, 8192, 1, 120] + - [23, 27.41] + - - [256, 8192, 1, 128] + - [24, 32.62] + - - [256, 8192, 1, 512] + - [25, 58.9] + - - [512, 8192, 1, 1024] + - [26, 90.42] + - - [512, 8192, 1, 2048] + - [27, 102.4] + - - [528, 8192, 1, 256] + - [28, 57.71] + - - [528, 8192, 1, 512] + - [29, 74.32] + - - [2440, 8192, 1, 128] + - [15, 59.84] + - - [5640, 8192, 1, 128] + - [30, 66.59] + - - [61, 128, 8192, 40] + - [31, 14.89] + - - [128, 33, 8192, 16] + - [32, 8.65] + - - [128, 61, 8192, 40] + - [33, 18.88] + - - [960, 17711, 1, 128] + - [34, 55.24] + - - [2480, 17711, 1, 128] + - [35, 63.59] + - - [128, 17711, 1, 256] + - [36, 41.1] + - - [17711, 246, 1, 384] + - [37, 63.33] + - - [41, 17711, 1, 128] + - [38, 10.94] + - - [128, 17711, 1, 928] + - [39, 60.29] + - - [124, 48, 17711, 20] + - [40, 10.87] + - - [41, 128, 17711, 6] + - [41, 3.24] + - - [64, 819200, 1, 64] + - [42, 34.01] + - - [32, 262144, 1, 60] + - [43, 17.58] + - - [32, 262144, 1, 57] + - [44, 17.74] + - - [64, 131072, 1, 64] + - [45, 36.24] + - - [512, 4096, 1, 7680] + - [46, 105.67] + - - [64, 806154, 1, 288] + - [47, 57.26] + - - [512, 4096, 1, 4132] + - [48, 93.58] + - - [304, 655360, 1, 644] + - [49, 94.46] + - - [256, 4096, 1, 4132] + - [50, 73.06] + - - [64, 57, 4096, 32] + - [24, 15.56] + - - [200, 32, 4096, 64] + - [51, 18.97] + - - [64, 4096, 96, 160] + - [52, 45.87] + - - [64, 82, 4096, 32] + - [53, 18.45] + - - [64, 4096, 1, 2048] + - [54, 29.94] + - - [2048, 4096, 1, 128] + - [30, 53.21] + - - [64, 131072, 1, 128] + - [55, 40.14] + - - [160, 4096, 1, 512] + - [56, 28.98] + - - [128, 4096, 1, 1600] + - [57, 40.1] + - - [2560, 4096, 1, 4096] + - [58, 133.67] + - - [320, 4096, 1, 72] + - [53, 11.57] + - - [64, 752863, 1, 224] + - [59, 55.19] + - - [64, 527553, 1, 224] + - [60, 54.03] + - - [57, 4096, 1, 2048] + - [54, 27.48] + - - [512, 4096, 1, 64] + - [61, 18.43] + - - [256, 4096, 1, 180] + - [23, 20.26] +- null +- null +- DeviceEfficiency +- Equality From bbb104877ff251af1c2f005607368c835be601dc Mon Sep 17 00:00:00 2001 From: b-shi Date: Mon, 10 Nov 2025 14:32:01 -0600 Subject: [PATCH 2/3] Disable GNLC for usesgprgro=1 --- ..._Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 16 ++++++++-------- .../gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml | 16 ++++++++-------- ..._Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 4 ++-- .../gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml | 4 ++-- .../Tensile/SolutionStructs/Solution.py | 3 +++ 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index ddd3f13a5e9..82d82242e58 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -29532,7 +29532,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -30012,7 +30012,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -30732,7 +30732,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -30972,7 +30972,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -31930,7 +31930,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -33130,7 +33130,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -33370,7 +33370,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -33610,7 +33610,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml index 53d68a7b194..1acb1b7b293 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml @@ -7285,7 +7285,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -7765,7 +7765,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -8485,7 +8485,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -8725,7 +8725,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -9683,7 +9683,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -10883,7 +10883,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -11123,7 +11123,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11363,7 +11363,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 5c585965170..21facc71fb2 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -27480,7 +27480,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27720,7 +27720,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml index 6b0e5a39bae..1fdae462c24 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml @@ -14303,7 +14303,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -14543,7 +14543,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index 5b9c3e6ff19..74a7f0f450a 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -2017,6 +2017,9 @@ def calSwizzlePackK(state, tc): reject(state, printRejectionReason, "GRVWB * DataTypeB.numBytes() > 16") disableGNLC = False # Set to true to disable GNLC if needed + # Temporary hack, if usesgprforgro is set to 1 to save vgprs, disable GNLC + if state["UseSgprForGRO"] == 1: + disableGNLC = True isMixedPrec = (state["ProblemType"]["DataTypeA"].numBytes() != state["ProblemType"]["DataTypeB"].numBytes()) if state["DirectToLds"] and state["LocalSplitU"] == 1 \ and not isMixedPrec and not state["ProblemType"]["Sparse"] \ From 0090202ec2ac66e79918d0ae22db5def52c32eb9 Mon Sep 17 00:00:00 2001 From: Peter Date: Tue, 11 Nov 2025 02:19:14 -0600 Subject: [PATCH 3/3] update kernel for usesgprforgro=1 --- ...lk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 13975 +++++------ ...gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml | 3773 +-- ...lk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 13995 ++++++----- ...gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml | 3377 ++- ...ik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml | 19135 +++++++++------- ...gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml | 7940 ++++--- 6 files changed, 35988 insertions(+), 26207 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 82d82242e58..16ef2cdbe0e 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -3159,243 +3159,6 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AdaptiveGemm: 0 - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x64_MI32x3i3wgIkgjfqD9uXc_vVgFeJgfaL8Y6k8yzx4nmpWsnuI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 - LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 24576 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 16 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -3558,7 +3321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 + SolutionIndex: 13 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -3795,7 +3558,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 + SolutionIndex: 14 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4032,7 +3795,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 + SolutionIndex: 15 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4269,7 +4032,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 + SolutionIndex: 16 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4506,7 +4269,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 + SolutionIndex: 17 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4743,7 +4506,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 + SolutionIndex: 18 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4980,7 +4743,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 + SolutionIndex: 19 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -5055,7 +4818,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -5067,249 +4830,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x64x32_MI16xwgOGIlbHSY-6NN6XLdmF7L-EWnBxY0nUWipOv7yd_Fk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: true - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 - LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 2] - MIWaveTileA: 7 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 64 - MacroTileA: 224 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 5 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 7 - NumLoadsB: 4 - NumLoadsCoalescedA: 7 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 2 - ThreadTileA: 28 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AdaptiveGemm: 0 - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -5454,7 +4980,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 + SolutionIndex: 20 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -5691,7 +5217,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 + SolutionIndex: 21 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -5928,7 +5454,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 + SolutionIndex: 22 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -6165,7 +5691,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 + SolutionIndex: 23 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -6402,7 +5928,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 + SolutionIndex: 24 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -6639,7 +6165,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 + SolutionIndex: 25 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -6876,7 +6402,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 + SolutionIndex: 26 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7113,7 +6639,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 + SolutionIndex: 27 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7350,7 +6876,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 + SolutionIndex: 28 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7437,7 +6963,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xS8BemcgDmhBdB4lISWaJd3al9jSpienby5xNB2TIiUs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= BufferLoad: true BufferStore: true CUCount: null @@ -7468,7 +6994,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -7480,36 +7006,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 @@ -7520,7 +7046,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7529,14 +7055,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7556,7 +7082,7 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 5 NonTemporalB: 7 NonTemporalC: 1 NonTemporalD: 3 @@ -7564,14 +7090,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7588,22 +7114,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -7611,16 +7137,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7633,8 +7159,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -7642,7 +7168,7 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7677,20 +7203,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32MrNnY6-q5-x--_2qLKFIhdkT-VfmsesFKPD_9pFBJKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7720,47 +7246,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7768,15 +7294,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7790,27 +7316,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 4 NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -7828,33 +7354,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7870,26 +7396,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -7898,13 +7424,13 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -7917,7 +7443,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= BufferLoad: true BufferStore: true CUCount: null @@ -7927,10 +7453,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7960,47 +7486,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 64 - LSPA: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 - LVPA: 4 + LVPA: 1 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 99840 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 99840 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8008,14 +7534,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [4, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 256 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 256 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -8030,28 +7556,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8068,25 +7594,25 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -8095,12 +7621,12 @@ ThreadTile1: 2 ThreadTileA: 32 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -8110,7 +7636,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -8120,18 +7646,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8143,8 +7669,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -8157,7 +7683,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= BufferLoad: true BufferStore: true CUCount: null @@ -8167,7 +7693,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -8182,16 +7708,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -8200,39 +7726,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 128 + LSPA: 1 + LSPB: 2 + LVCA: 256 + LVCB: 128 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -8240,7 +7766,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8248,15 +7774,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8276,22 +7802,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8308,39 +7834,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -8353,29 +7879,29 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -8397,7 +7923,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= BufferLoad: true BufferStore: true CUCount: null @@ -8440,34 +7966,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 1 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -8488,15 +8014,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8517,21 +8043,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 2 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 + NumElementsPerBatchStore: 12 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8548,14 +8074,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -8563,11 +8089,11 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -8590,7 +8116,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -8600,9 +8126,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8611,15 +8137,15 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -8637,12 +8163,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8662,15 +8188,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -8680,34 +8206,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 116736 LdsInitCVgprs: false LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -8729,14 +8255,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8756,22 +8282,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 + NonTemporalA: 4 + NonTemporalB: 1 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8788,18 +8314,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8811,10 +8337,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8833,15 +8359,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 + WorkGroupMapping: 32 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -8851,15 +8377,15 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -8877,7 +8403,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xngs_elNKDW6m1ocFurAVW1-bILkxN-GRvv7ATNlFbNA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= BufferLoad: true BufferStore: true CUCount: null @@ -8887,17 +8413,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -8908,7 +8434,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -8920,47 +8446,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8969,14 +8495,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8990,22 +8516,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -9019,7 +8545,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -9028,22 +8554,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -9051,16 +8577,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -9070,26 +8596,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -9103,8 +8629,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -9117,7 +8643,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= BufferLoad: true BufferStore: true CUCount: null @@ -9128,9 +8654,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9142,16 +8668,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -9160,47 +8686,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9209,14 +8735,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9230,28 +8756,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 24 + NumLoadsB: 32 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9268,22 +8794,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -9291,16 +8817,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -9310,11 +8836,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -9322,7 +8848,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9331,21 +8857,21 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -9357,7 +8883,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= BufferLoad: true BufferStore: true CUCount: null @@ -9383,7 +8909,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -9391,7 +8917,7 @@ GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -9400,34 +8926,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 64 LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 - LVPA: 2 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 4 LVPB: 1 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 60928 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -9448,14 +8974,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 192 MacroTile1: 256 - MacroTileA: 128 + MacroTileA: 192 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9476,22 +9002,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 0 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 1 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 + NumLoadsB: 16 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9508,11 +9034,11 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 1 @@ -9524,17 +9050,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 96 + ThreadTile1: 2 + ThreadTileA: 96 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9554,15 +9080,15 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9597,7 +9123,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= BufferLoad: true BufferStore: true CUCount: null @@ -9623,7 +9149,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -9640,34 +9166,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 128 LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 2 LVPB: 1 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -9688,14 +9214,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 256 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9716,22 +9242,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9748,33 +9274,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 64 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 64 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9790,19 +9316,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9837,7 +9363,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= BufferLoad: true BufferStore: true CUCount: null @@ -9862,16 +9388,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -9880,34 +9406,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 - LVPA: 4 + LSPA: 16 + LSPB: 2 + LVCA: 16 + LVCB: 128 + LVPA: 8 LVPB: 1 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 123392 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -9928,14 +9454,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 160 MacroTile1: 256 - MacroTileA: 192 + MacroTileA: 160 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9957,21 +9483,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 24 - NumLoadsB: 32 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 10 + NumLoadsB: 16 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9988,14 +9514,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -10004,17 +9530,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 + ThreadTile0: 80 + ThreadTile1: 2 + ThreadTileA: 80 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10034,13 +9560,13 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -10055,7 +9581,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -10065,7 +9591,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -10077,7 +9603,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= BufferLoad: true BufferStore: true CUCount: null @@ -10120,34 +9646,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 128 - LVPA: 4 - LVPB: 1 + LSCA: 256 + LSCB: 128 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 1 + LVPB: 2 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -10168,15 +9694,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10196,22 +9722,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 16 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10228,13 +9754,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -10243,18 +9769,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 2 - ThreadTileA: 96 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10274,15 +9800,15 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10295,7 +9821,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -10317,7 +9843,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= BufferLoad: true BufferStore: true CUCount: null @@ -10327,10 +9853,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10342,15 +9868,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10360,47 +9886,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 256 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 2 + LSPB: 32 LVCA: 32 - LVCB: 128 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10408,15 +9934,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10430,28 +9956,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalA: 5 + NonTemporalB: 4 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10468,39 +9994,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10510,28 +10036,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10543,8 +10069,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -10557,7 +10083,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= BufferLoad: true BufferStore: true CUCount: null @@ -10567,10 +10093,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10582,15 +10108,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10600,47 +10126,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 32 - LSCB: 256 - LSPA: 16 - LSPB: 2 - LVCA: 16 - LVCB: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10648,15 +10174,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10670,28 +10196,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 1 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 10 - NumLoadsB: 16 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10708,14 +10234,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -10723,24 +10249,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10750,41 +10276,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -10797,7 +10323,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= BufferLoad: true BufferStore: true CUCount: null @@ -10807,10 +10333,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10822,8 +10348,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -10840,45 +10366,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -10888,15 +10414,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10910,28 +10436,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10940,7 +10466,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10948,39 +10474,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 + StreamKXCCMapping: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10990,32 +10516,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -11023,8 +10549,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -11037,7 +10563,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xfjyEPyKcLju337expcYgdW_PYDBxtV--GwqFuufDIHQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= BufferLoad: true BufferStore: true CUCount: null @@ -11062,15 +10588,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11080,34 +10606,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -11120,7 +10646,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11129,13 +10655,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11156,21 +10682,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 2 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -11188,22 +10714,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11211,9 +10737,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11230,10 +10756,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11251,15 +10777,15 @@ _DepthUB: 128 _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11277,7 +10803,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= BufferLoad: true BufferStore: true CUCount: null @@ -11287,10 +10813,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11302,7 +10828,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -11310,7 +10836,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -11320,48 +10846,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -11373,44 +10899,44 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 + NonTemporalA: 3 + NonTemporalB: 2 NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -11428,13 +10954,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -11444,16 +10970,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11480,16 +11006,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -11503,8 +11029,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -11517,7 +11043,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3v3FiLk7zthz6BecsfG_ytiGrAKMvyDv3YQNVHChxGm8= BufferLoad: true BufferStore: true CUCount: null @@ -11527,7 +11053,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11542,16 +11068,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -11560,34 +11086,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -11597,8 +11123,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -11636,22 +11162,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 1 + NonTemporalA: 6 + NonTemporalB: 6 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11660,7 +11186,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11668,14 +11194,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -11683,7 +11209,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11710,7 +11236,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11722,24 +11248,24 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11757,7 +11283,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3ztAWOuJIg4hj-4ujEuGL1U8M4eXirObR_poPSQp0OOM= BufferLoad: true BufferStore: true CUCount: null @@ -11782,16 +11308,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -11800,34 +11326,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -11849,13 +11375,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11876,22 +11402,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 - NonTemporalC: 5 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11908,22 +11434,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11931,9 +11457,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11953,7 +11479,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11961,8 +11487,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11978,8 +11504,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11997,7 +11523,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= BufferLoad: true BufferStore: true CUCount: null @@ -12007,7 +11533,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -12040,7 +11566,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -12053,21 +11579,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -12077,10 +11603,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12116,22 +11642,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 14 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12140,7 +11666,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12148,17 +11674,17 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 @@ -12190,7 +11716,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -12206,12 +11732,12 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12237,12 +11763,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -12280,8 +11806,8 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 16 LSCB: 16 LSPA: 64 @@ -12357,13 +11883,13 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 3 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 + NumElementsPerBatchStore: 8 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 @@ -12388,8 +11914,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -12403,7 +11929,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -12458,8 +11984,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12477,7 +12003,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= BufferLoad: true BufferStore: true CUCount: null @@ -12503,15 +12029,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -12520,34 +12046,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -12569,13 +12095,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12596,22 +12122,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12628,18 +12154,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -12651,9 +12177,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -12670,10 +12196,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -12691,7 +12217,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12717,20 +12243,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12742,16 +12268,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -12760,48 +12286,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 - LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -12813,45 +12339,45 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12860,7 +12386,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12868,32 +12394,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -12920,16 +12446,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12943,8 +12469,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -12957,7 +12483,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= BufferLoad: true BufferStore: true CUCount: null @@ -12983,15 +12509,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13000,34 +12526,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -13049,13 +12575,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -13078,20 +12604,20 @@ NonTemporal: -1 NonTemporalA: 5 NonTemporalB: 7 - NonTemporalC: 4 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13108,18 +12634,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -13131,9 +12657,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -13150,10 +12676,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -13161,8 +12687,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13171,11 +12697,11 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -13197,7 +12723,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= BufferLoad: true BufferStore: true CUCount: null @@ -13207,10 +12733,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13222,16 +12748,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13240,48 +12766,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -13293,45 +12819,45 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13340,7 +12866,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13348,32 +12874,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -13390,7 +12916,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -13400,31 +12926,31 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -13437,7 +12963,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= BufferLoad: true BufferStore: true CUCount: null @@ -13480,8 +13006,8 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 LSCB: 32 LSPA: 16 @@ -13520,7 +13046,7 @@ LoopIters: 1 LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13556,14 +13082,14 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 + NonTemporalA: 7 + NonTemporalB: 5 NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 @@ -13588,14 +13114,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -13603,7 +13129,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -13630,7 +13156,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -13641,8 +13167,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13655,11 +13181,11 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13677,7 +13203,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= BufferLoad: true BufferStore: true CUCount: null @@ -13687,10 +13213,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13702,16 +13228,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13720,48 +13246,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -13773,46 +13299,46 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13820,7 +13346,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13828,14 +13354,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -13844,16 +13370,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -13870,7 +13396,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -13880,31 +13406,31 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -13917,7 +13443,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= BufferLoad: true BufferStore: true CUCount: null @@ -13927,10 +13453,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13948,7 +13474,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -13960,48 +13486,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14009,49 +13535,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14060,7 +13586,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14068,32 +13594,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14110,41 +13636,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -14157,7 +13683,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= BufferLoad: true BufferStore: true CUCount: null @@ -14167,7 +13693,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14182,16 +13708,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14200,48 +13726,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14249,23 +13775,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14276,23 +13802,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalA: 6 + NonTemporalB: 1 NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14308,32 +13834,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14353,23 +13879,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -14378,8 +13904,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14397,7 +13923,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= BufferLoad: true BufferStore: true CUCount: null @@ -14407,10 +13933,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14422,7 +13948,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -14430,7 +13956,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -14440,97 +13966,97 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalA: 6 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -14548,13 +14074,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -14563,17 +14089,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14590,7 +14116,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -14600,18 +14126,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14623,8 +14149,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -14637,7 +14163,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= BufferLoad: true BufferStore: true CUCount: null @@ -14647,7 +14173,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14668,7 +14194,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -14680,72 +14206,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 32 - LSCB: 16 + LSCB: 64 LSPA: 32 - LSPB: 64 + LSPB: 16 LVCA: 8 - LVCB: 4 + LVCB: 16 LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 57344 LdsInitCVgprs: false LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14756,22 +14282,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 2 + NonTemporalA: 4 + NonTemporalB: 7 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14780,7 +14306,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14788,8 +14314,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -14799,21 +14325,21 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -14830,36 +14356,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14877,20 +14403,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14902,15 +14428,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -14920,98 +14446,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 124928 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 6 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15028,39 +14554,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -15070,41 +14596,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -15117,7 +14643,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= BufferLoad: true BufferStore: true CUCount: null @@ -15142,15 +14668,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15160,45 +14686,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 64 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -15208,14 +14734,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -15236,21 +14762,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -15260,7 +14786,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15268,8 +14794,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -15279,22 +14805,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15313,16 +14839,16 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -15338,8 +14864,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15357,20 +14883,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15382,15 +14908,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15400,48 +14926,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124928 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 124928 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -15449,49 +14975,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 6 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 8 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15500,7 +15026,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15508,39 +15034,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -15550,41 +15076,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -15597,7 +15123,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= BufferLoad: true BufferStore: true CUCount: null @@ -15607,10 +15133,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15622,15 +15148,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15640,48 +15166,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -15689,48 +15215,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -15740,7 +15266,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15748,33 +15274,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15790,26 +15316,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -15823,8 +15349,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -15837,7 +15363,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= BufferLoad: true BufferStore: true CUCount: null @@ -15847,7 +15373,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15863,15 +15389,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -15880,39 +15406,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPB: 4 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -15929,14 +15455,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15956,23 +15482,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 7 NonTemporalB: 6 - NonTemporalC: 5 + NonTemporalC: 0 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15988,22 +15514,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -16011,10 +15537,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16030,28 +15556,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16077,7 +15603,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= BufferLoad: true BufferStore: true CUCount: null @@ -16087,10 +15613,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16102,15 +15628,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -16120,48 +15646,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -16169,48 +15695,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -16220,7 +15746,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16228,33 +15754,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16270,26 +15796,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -16303,8 +15829,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -16317,7 +15843,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= BufferLoad: true BufferStore: true CUCount: null @@ -16360,34 +15886,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 4 - LVCA: 32 + LSPB: 8 + LVCA: 64 LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -16400,7 +15926,7 @@ LoopIters: 1 LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16408,14 +15934,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -16436,23 +15962,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 5 + NonTemporalB: 5 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -16468,13 +15994,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -16484,9 +16010,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -16520,7 +16046,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -16538,14 +16064,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -16557,7 +16083,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= BufferLoad: true BufferStore: true CUCount: null @@ -16567,10 +16093,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16583,15 +16109,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -16600,47 +16126,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 1 + LSPB: 4 + LVCA: 256 + LVCB: 64 + LVPA: 1 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16648,15 +16174,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16670,28 +16196,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 1 NonTemporalB: 6 - NonTemporalC: 7 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 32 + NumLoadsB: 24 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16700,7 +16226,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16708,39 +16234,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -16750,28 +16276,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16783,8 +16309,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -16797,7 +16323,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= BufferLoad: true BufferStore: true CUCount: null @@ -16807,7 +16333,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -16823,15 +16349,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -16840,45 +16366,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -16888,15 +16414,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16917,21 +16443,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 5 - NonTemporalB: 5 - NonTemporalC: 5 + NonTemporalB: 1 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16940,7 +16466,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16948,33 +16474,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 5 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16993,23 +16519,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -17025,7 +16551,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -17037,17 +16563,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -17062,16 +16588,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -17080,72 +16606,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 1 - LSPB: 4 - LVCA: 256 - LVCB: 64 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -17157,21 +16683,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 32 - NumLoadsB: 24 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17188,39 +16714,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17230,36 +16756,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -17277,7 +16803,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= BufferLoad: true BufferStore: true CUCount: null @@ -17287,10 +16813,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17302,15 +16828,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -17320,43 +16846,43 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 128 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCB: 64 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -17368,15 +16894,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17390,27 +16916,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 2 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -17428,39 +16954,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 64 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 64 ThreadTileB: 2 - TransposeLDS: 0 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17470,41 +16996,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -17517,20 +17043,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32eYnwK6Kei8GLtUjTz4SwAJN4sTgnevpcBqDgn7APuNc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17548,7 +17074,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -17560,48 +17086,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 5632 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 45568 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 112128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 112128 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -17609,49 +17135,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [11, 5] + MIWaveTileA: 11 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 352 + MacroTile1: 160 + MacroTileA: 352 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 220 + NumGlobalWriteVectorsPerThread: 220 + NumLoadsA: 11 + NumLoadsB: 5 + NumLoadsCoalescedA: 11 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17660,7 +17186,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17668,33 +17194,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 44 + ThreadTile1: 5 + ThreadTileA: 44 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17710,19 +17236,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -17731,11 +17257,11 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -17743,8 +17269,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -17757,7 +17283,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= BufferLoad: true BufferStore: true CUCount: null @@ -17767,7 +17293,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -17800,47 +17326,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1536 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 58368 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 58368 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 12800 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 45568 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 45568 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17848,15 +17374,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17876,22 +17402,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17908,14 +17434,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -17923,18 +17449,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17960,22 +17486,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -17997,7 +17523,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32sjq3VlWgDexdtB30zdaxZHZ2177W8NQHz9MJk1SRjvM= BufferLoad: true BufferStore: true CUCount: null @@ -18008,9 +17534,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18023,12 +17549,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -18040,43 +17566,43 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 128 LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 - LdsNumElementsAlignedMetadata: 0 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -18089,13 +17615,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -18110,28 +17636,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 2 + NonTemporalB: 2 + NonTemporalC: 4 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18148,22 +17674,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -18171,16 +17697,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 64 + ThreadTileA: 32 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18193,7 +17719,7 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -18202,7 +17728,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -18215,7 +17741,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -18223,8 +17749,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -18237,20 +17763,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18280,47 +17806,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 5632 - LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 45568 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 45568 - LdsOffsetB_Blk: 112128 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45568 - LdsOffsetMetadata_Blk: 112128 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18328,15 +17854,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [11, 5] - MIWaveTileA: 11 - MIWaveTileB: 5 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 352 - MacroTile1: 160 - MacroTileA: 352 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18350,28 +17876,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalA: 7 + NonTemporalB: 3 NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 220 - NumGlobalWriteVectorsPerThread: 220 - NumLoadsA: 11 - NumLoadsB: 5 - NumLoadsCoalescedA: 11 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18380,7 +17906,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18388,33 +17914,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 44 - ThreadTile1: 5 - ThreadTileA: 44 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18434,28 +17960,28 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -18463,8 +17989,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -18477,20 +18003,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18502,15 +18028,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -18520,98 +18046,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 1536 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58368 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 58368 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 12800 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 45568 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 45568 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 2 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18620,7 +18146,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18628,33 +18154,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18670,32 +18196,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -18703,8 +18229,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -18717,7 +18243,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= BufferLoad: true BufferStore: true CUCount: null @@ -18742,7 +18268,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -18750,7 +18276,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -18760,24 +18286,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -18797,35 +18323,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -18836,22 +18362,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18868,14 +18394,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -18883,18 +18409,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18910,11 +18436,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -18922,7 +18448,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -18931,7 +18457,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -18957,7 +18483,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= BufferLoad: true BufferStore: true CUCount: null @@ -18967,10 +18493,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18982,7 +18508,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -18990,7 +18516,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19000,47 +18526,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 256 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19048,15 +18574,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19070,28 +18596,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19100,7 +18626,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19108,14 +18634,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -19123,24 +18649,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19150,26 +18676,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -19178,13 +18704,13 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -19197,7 +18723,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= BufferLoad: true BufferStore: true CUCount: null @@ -19207,10 +18733,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19222,15 +18748,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19240,47 +18766,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 256 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19289,14 +18815,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19310,28 +18836,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19348,22 +18874,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -19371,16 +18897,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19393,8 +18919,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -19402,14 +18928,14 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -19423,9 +18949,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -19437,12 +18963,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -19462,13 +18988,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -19480,36 +19006,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -19517,35 +19043,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -19556,23 +19082,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19580,7 +19106,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19588,33 +19114,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19630,17 +19156,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -19665,7 +19191,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -19677,12 +19203,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -19702,13 +19228,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -19720,36 +19246,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 1 - LVPB: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 8192 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -19757,35 +19283,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -19796,23 +19322,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19828,39 +19354,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19873,16 +19399,16 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -19898,14 +19424,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -19917,7 +19443,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= BufferLoad: true BufferStore: true CUCount: null @@ -19937,20 +19463,20 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19960,36 +19486,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: 0 LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -20000,7 +19526,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20008,15 +19534,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20036,22 +19562,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20059,7 +19585,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -20068,39 +19594,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20110,19 +19636,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20157,7 +19683,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= BufferLoad: true BufferStore: true CUCount: null @@ -20182,16 +19708,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -20200,36 +19726,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8192 + LdsBytesNoAmax: 12800 LdsInitCVgprs: false - LdsNumBytes: 8192 + LdsNumBytes: 12800 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 16384 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -20248,15 +19774,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20276,23 +19802,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 + NonTemporalA: 1 + NonTemporalB: 6 NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20308,14 +19834,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -20325,16 +19851,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20350,19 +19876,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20371,7 +19897,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20385,7 +19911,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -20397,7 +19923,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= BufferLoad: true BufferStore: true CUCount: null @@ -20428,7 +19954,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: false @@ -20440,34 +19966,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 + LSCA: 64 + LSCB: 256 LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 4096 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -20488,15 +20014,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20516,23 +20042,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20548,8 +20074,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -20559,22 +20085,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20590,17 +20116,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -20611,7 +20137,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20625,7 +20151,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -20637,20 +20163,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20662,16 +20188,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -20680,99 +20206,99 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 LSPA: 8 LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 + NonTemporalA: 4 + NonTemporalB: 4 NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20788,14 +20314,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -20803,18 +20329,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 6 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20830,19 +20356,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20851,23 +20377,23 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -20877,40 +20403,40 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -20920,45 +20446,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 40960 + LdsNumBytes: 32768 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 16384 LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -20968,15 +20494,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20990,37 +20516,39 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21028,33 +20556,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21068,28 +20596,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -21098,16 +20629,16 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -21117,7 +20648,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= BufferLoad: true BufferStore: true CUCount: null @@ -21127,17 +20658,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -21160,106 +20691,108 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 128 + LSCA: 32 + LSCB: 16 LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 + LSPB: 16 + LVCA: 8 + LVCB: 4 LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -21268,39 +20801,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21308,30 +20841,33 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21343,8 +20879,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -21357,7 +20893,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= BufferLoad: true BufferStore: true CUCount: null @@ -21382,7 +20918,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -21390,7 +20926,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -21400,48 +20936,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 32 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -21453,19 +20989,19 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -21476,23 +21012,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumTotalPackedLoadsA: -1 NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 @@ -21510,14 +21046,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -21526,16 +21062,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -21555,7 +21091,7 @@ UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -21565,9 +21101,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -21576,7 +21112,7 @@ _DepthUB: 128 _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21590,7 +21126,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -21602,7 +21138,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= BufferLoad: true BufferStore: true CUCount: null @@ -21612,7 +21148,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -21622,7 +21158,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -21645,48 +21181,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -21694,23 +21230,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -21721,23 +21257,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumThreads: 256 NumTotalPackedLoadsA: -1 NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 @@ -21746,7 +21282,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -21755,39 +21291,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21800,46 +21336,771 @@ UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21847,7 +22108,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= BufferLoad: true BufferStore: true CUCount: null @@ -21857,72 +22118,72 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -21930,7 +22191,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21938,15 +22199,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21958,7 +22219,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -21966,25 +22227,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22000,33 +22260,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22040,41 +22301,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22084,7 +22342,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22092,7 +22350,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT13u0IoyZA0H6JeQsxYx5dKpo7nQ9hoyYgsEHeT0aCVTI= BufferLoad: true BufferStore: true CUCount: null @@ -22102,27 +22360,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -22132,50 +22390,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22183,15 +22441,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22203,33 +22461,32 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22245,33 +22502,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22285,46 +22543,43 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -22337,7 +22592,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= BufferLoad: true BufferStore: true CUCount: null @@ -22347,10 +22602,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -22368,7 +22623,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -22380,43 +22635,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 114688 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 114688 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -22428,15 +22683,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22450,7 +22705,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -22463,15 +22718,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22489,38 +22744,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -22532,26 +22787,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22560,16 +22815,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -22579,7 +22834,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= BufferLoad: true BufferStore: true CUCount: null @@ -22589,10 +22844,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -22618,47 +22873,47 @@ ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 73728 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 147456 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -22670,15 +22925,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22692,7 +22947,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -22706,14 +22961,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22731,13 +22986,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -22747,18 +23002,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 7 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 7 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22784,16 +23039,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22807,11 +23062,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -22821,6 +23076,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= BufferLoad: true BufferStore: true CUCount: null @@ -22830,7 +23086,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -22863,39 +23119,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -22911,15 +23167,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22947,14 +23203,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22972,13 +23228,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -22989,17 +23245,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 128 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23015,26 +23271,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23062,7 +23318,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= BufferLoad: true BufferStore: true CUCount: null @@ -23072,7 +23328,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -23105,39 +23361,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -23153,15 +23409,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23184,18 +23440,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 @@ -23214,13 +23470,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -23231,9 +23487,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -23257,7 +23513,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -23267,16 +23523,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23304,7 +23560,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= BufferLoad: true BufferStore: true CUCount: null @@ -23314,10 +23569,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -23335,7 +23590,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -23347,43 +23602,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 49152 - LdsNumElementsAlignedB: 12288 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 49152 - LdsOffsetB_Blk: 114688 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 - LdsOffsetMetadata_Blk: 114688 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -23395,15 +23650,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 96 - MacroTileA: 384 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23417,7 +23672,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -23430,15 +23685,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 12 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -23456,38 +23711,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -23499,26 +23754,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23532,11 +23787,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -23546,7 +23801,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= BufferLoad: true BufferStore: true CUCount: null @@ -23556,7 +23810,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -23589,39 +23843,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 57344 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 147456 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 147456 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -23637,15 +23891,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 7] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 448 - MacroTileA: 128 - MacroTileB: 448 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23673,14 +23927,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 7 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -23698,13 +23952,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -23714,22 +23968,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 7 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 7 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -23741,7 +23995,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -23751,16 +24005,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23769,8 +24023,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23788,7 +24042,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= BufferLoad: true BufferStore: true CUCount: null @@ -23798,7 +24051,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -23831,39 +24084,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -23879,15 +24132,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23915,14 +24168,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -23940,13 +24193,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -23957,17 +24210,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23983,26 +24236,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -24018,7 +24271,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -24030,7 +24283,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= BufferLoad: true BufferStore: true CUCount: null @@ -24041,9 +24293,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -24073,8 +24325,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -24086,9 +24338,9 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 @@ -24099,7 +24351,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 + LdsOffsetMetadata: 65536 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 @@ -24108,8 +24360,8 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -24143,7 +24395,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24182,8 +24434,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24213,7 +24465,7 @@ TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24225,7 +24477,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -24236,7 +24488,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -24253,14 +24505,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -24281,10 +24533,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -24314,43 +24566,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 98304 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 98304 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 163840 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -24362,15 +24614,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24384,7 +24636,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24398,14 +24650,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -24423,13 +24675,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -24440,21 +24692,21 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24470,22 +24722,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -24494,13 +24746,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -24555,24 +24807,24 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49152 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 49152 + LdsNumBytes: 59392 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 26624 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -24581,46 +24833,46 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 + LdsOffsetMetadata: 59392 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 48 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -24639,14 +24891,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -24664,8 +24916,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24680,18 +24932,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24707,7 +24959,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -24717,7 +24969,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -24754,6 +25006,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= BufferLoad: true BufferStore: true CUCount: null @@ -24763,10 +25016,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -24784,7 +25037,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -24796,48 +25049,48 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 + LVCA: 4 + LVCB: 4 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -24845,51 +25098,51 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24905,38 +25158,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24951,23 +25204,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -24976,14 +25229,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -24995,6 +25248,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -25005,9 +25259,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -25025,7 +25279,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -25037,98 +25291,98 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 65536 LdsInitCVgprs: false LdsNumBytes: 65536 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 65536 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -25146,8 +25400,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25157,27 +25411,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25189,18 +25443,18 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -25222,9 +25476,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -25245,10 +25499,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -25278,34 +25532,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98304 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 98304 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 81920 - LdsOffsetMetadata_Blk: 163840 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -25313,13 +25567,13 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -25327,34 +25581,34 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 @@ -25362,15 +25616,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25387,13 +25640,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -25403,22 +25656,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25430,7 +25683,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -25440,16 +25693,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -25458,13 +25711,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -25477,7 +25730,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= BufferLoad: true BufferStore: true CUCount: null @@ -25487,10 +25740,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -25520,43 +25773,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -25568,15 +25821,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25590,7 +25843,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -25613,7 +25866,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25629,13 +25882,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -25645,22 +25898,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25672,26 +25925,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -25700,13 +25953,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -25719,7 +25972,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -25762,8 +26014,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -25838,8 +26090,8 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 @@ -25871,8 +26123,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25914,7 +26166,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -25925,7 +26177,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -25942,8 +26194,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25961,6 +26213,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= BufferLoad: true BufferStore: true CUCount: null @@ -25970,7 +26223,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -25991,7 +26244,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26003,45 +26256,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -26052,13 +26305,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -26071,7 +26324,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -26087,15 +26340,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26111,18 +26365,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -26135,9 +26389,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -26154,26 +26408,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 6 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -26189,7 +26443,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -26201,7 +26455,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= BufferLoad: true BufferStore: true CUCount: null @@ -26211,10 +26465,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -26232,7 +26486,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26244,43 +26498,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -26292,15 +26546,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26312,9 +26566,9 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -26323,20 +26577,19 @@ NonTemporalA: 0 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26353,34 +26606,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26396,26 +26649,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -26424,13 +26677,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -26452,7 +26705,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -26473,7 +26726,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26485,7 +26738,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -26498,26 +26751,26 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -26533,9 +26786,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 @@ -26553,7 +26806,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -26561,23 +26814,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26594,33 +26846,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -26637,26 +26889,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 6 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -26684,7 +26936,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= BufferLoad: true BufferStore: true CUCount: null @@ -26727,7 +26979,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 LSCA: 16 LSCB: 16 @@ -26803,8 +27055,8 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 @@ -26836,8 +27088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -26879,7 +27131,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -26926,7 +27178,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3oeOmeGqaW_jaSFGz_jm6w84YNB03l0N-hq9s6Jgreqc= BufferLoad: true BufferStore: true CUCount: null @@ -26969,7 +27220,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 LSCB: 16 @@ -27045,8 +27296,8 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 @@ -27077,8 +27328,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -27120,7 +27371,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27131,7 +27382,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -27155,7 +27406,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -27167,7 +27418,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= BufferLoad: true BufferStore: true CUCount: null @@ -27177,10 +27428,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -27198,7 +27449,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27210,43 +27461,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 53248 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -27258,15 +27509,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27278,30 +27529,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27318,34 +27570,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27361,26 +27613,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -27389,13 +27641,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -27408,6 +27660,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= BufferLoad: true BufferStore: true CUCount: null @@ -27417,7 +27670,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -27450,39 +27703,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 LSCA: 32 - LSCB: 32 + LSCB: 64 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -27498,15 +27751,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27518,7 +27771,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -27534,14 +27787,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27558,13 +27812,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -27575,9 +27829,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -27601,7 +27855,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -27611,16 +27865,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -27648,7 +27902,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= BufferLoad: true BufferStore: true CUCount: null @@ -27691,8 +27945,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 LSCA: 16 LSCB: 16 LSPA: 16 @@ -27770,7 +28024,7 @@ NonTemporalA: 0 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -27800,8 +28054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -27843,7 +28097,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -27871,14 +28125,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -27890,6 +28144,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= BufferLoad: true BufferStore: true CUCount: null @@ -27899,10 +28154,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -27920,7 +28175,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27932,43 +28187,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 37888 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -27980,15 +28235,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28000,30 +28255,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28040,34 +28296,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28086,23 +28342,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -28116,9 +28372,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -28130,7 +28386,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= BufferLoad: true BufferStore: true CUCount: null @@ -28140,10 +28395,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -28173,43 +28428,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -28221,15 +28476,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28243,28 +28498,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -28282,13 +28537,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -28298,17 +28553,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 12 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 12 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -28335,16 +28590,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -28358,9 +28613,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -28372,7 +28627,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Pg_BkawruRCL1XGe64CaI5c_MySTZSQHvn2xslbYUME= BufferLoad: true BufferStore: true CUCount: null @@ -28382,10 +28637,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -28403,7 +28658,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -28415,43 +28670,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 128 LSCB: 64 - LSPA: 32 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -28463,14 +28718,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -28485,7 +28740,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -28494,19 +28749,19 @@ NonTemporalA: 0 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -28524,33 +28779,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -28570,23 +28825,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -28600,11 +28855,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -28614,7 +28869,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kiD3-1VtWxVcsUpwA9y_XMbufpwlDdmtaKQ75wO2y28= BufferLoad: true BufferStore: true CUCount: null @@ -28624,7 +28879,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -28642,10 +28897,10 @@ GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -28657,75 +28912,75 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 8 LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -28734,23 +28989,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumMbskPrefetchElements: 16 - NumThreads: 64 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28766,34 +29020,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28809,27 +29062,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBufferSingleKernel + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -28844,9 +29097,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -28856,7 +29109,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= BufferLoad: true BufferStore: true CUCount: null @@ -28866,10 +29119,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -28884,10 +29137,10 @@ GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -28899,99 +29152,98 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 128 LSCB: 128 - LSPA: 64 + LSPA: 8 LSPB: 8 - LVCA: 4 + LVCA: 32 LVCB: 32 - LVPA: 16 + LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 37888 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 37888 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 70656 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37888 - LdsOffsetMetadata_Blk: 70656 - LdsPadA: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumMbskPrefetchElements: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29008,18 +29260,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -29030,11 +29282,10 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -29051,27 +29302,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29084,8 +29335,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -29098,6 +29349,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= BufferLoad: true BufferStore: true CUCount: null @@ -29107,7 +29359,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -29125,10 +29377,10 @@ GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29140,35 +29392,35 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 64 - LSPB: 8 - LVCA: 4 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 2048 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 46080 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 46080 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 78848 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 78848 - LdsPadA: 16 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -29178,37 +29430,37 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 128 - MacroTileA: 48 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -29217,22 +29469,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumMbskPrefetchElements: 16 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29249,18 +29500,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -29271,12 +29522,11 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29292,27 +29542,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29339,7 +29589,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= BufferLoad: true BufferStore: true CUCount: null @@ -29349,7 +29599,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -29382,7 +29632,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 128 @@ -29395,21 +29645,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -29419,8 +29669,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] @@ -29461,19 +29711,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29490,13 +29740,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -29548,10 +29798,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -29579,7 +29829,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= BufferLoad: true BufferStore: true CUCount: null @@ -29622,7 +29872,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 64 @@ -29730,8 +29980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -29760,7 +30010,7 @@ TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -29807,7 +30057,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -29819,7 +30069,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= BufferLoad: true BufferStore: true CUCount: null @@ -29829,10 +30079,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -29862,34 +30112,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -29897,10 +30147,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] @@ -29911,14 +30161,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29932,7 +30182,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -29941,19 +30191,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29970,13 +30220,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -29994,13 +30244,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -30012,11 +30262,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30028,10 +30278,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -30045,9 +30295,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -30059,7 +30309,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= BufferLoad: true BufferStore: true CUCount: null @@ -30070,9 +30320,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -30102,34 +30352,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 40960 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30137,8 +30387,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -30151,14 +30401,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 192 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30172,7 +30422,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -30186,14 +30436,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30210,8 +30460,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -30234,13 +30484,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -30252,11 +30502,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30280,16 +30530,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -30299,7 +30549,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= BufferLoad: true BufferStore: true CUCount: null @@ -30310,9 +30560,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -30342,34 +30592,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 40960 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30377,8 +30627,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -30391,14 +30641,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 192 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30412,7 +30662,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -30426,14 +30676,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30450,8 +30700,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -30474,13 +30724,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -30492,11 +30742,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30525,9 +30775,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -30539,7 +30789,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= BufferLoad: true BufferStore: true CUCount: null @@ -30550,9 +30799,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -30570,7 +30819,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -30582,34 +30831,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30617,8 +30866,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -30631,13 +30880,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -30652,7 +30901,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -30665,14 +30914,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -30690,8 +30939,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -30701,7 +30950,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -30713,9 +30962,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -30735,7 +30984,7 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -30743,7 +30992,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -30765,9 +31014,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -30779,7 +31028,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= BufferLoad: true BufferStore: true CUCount: null @@ -30790,9 +31039,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -30810,7 +31059,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -30822,34 +31071,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30857,8 +31106,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -30871,13 +31120,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -30892,7 +31141,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -30905,14 +31154,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -30930,8 +31179,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -30941,7 +31190,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -30953,9 +31202,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -30975,7 +31224,7 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -31005,11 +31254,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -31019,6 +31268,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= BufferLoad: true BufferStore: true CUCount: null @@ -31061,34 +31311,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 128 - LSPA: 16 + LSPA: 32 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31109,15 +31359,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31145,13 +31395,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 6 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -31169,8 +31419,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -31185,17 +31435,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 48 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 48 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31211,18 +31461,18 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -31239,8 +31489,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -31258,7 +31508,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= BufferLoad: true BufferStore: true CUCount: null @@ -31301,7 +31551,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 128 @@ -31409,8 +31659,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 130 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -31439,7 +31689,7 @@ TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -31451,7 +31701,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -31498,7 +31748,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= BufferLoad: true BufferStore: true CUCount: null @@ -31508,10 +31758,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -31541,97 +31791,97 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 49152 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 77824 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 77824 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 384 - MacroTileA: 96 - MacroTileB: 384 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 3 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -31649,13 +31899,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -31665,17 +31915,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31691,7 +31941,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -31701,16 +31951,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -31719,16 +31969,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -31738,6 +31988,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3kn1MecExrukXO0aoF0LQyTOFFiyvmj1lAoIdfg5BGCM= BufferLoad: true BufferStore: true CUCount: null @@ -31747,7 +31998,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -31780,72 +32031,72 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -31857,21 +32108,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31888,13 +32139,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -31905,15 +32156,15 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -31940,16 +32191,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -31958,14 +32209,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -31977,7 +32228,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= BufferLoad: true BufferStore: true CUCount: null @@ -31987,10 +32238,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -32008,7 +32259,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -32020,97 +32271,97 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -32128,37 +32379,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 133 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -32170,26 +32421,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -32203,9 +32454,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -32217,7 +32468,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= BufferLoad: true BufferStore: true CUCount: null @@ -32227,10 +32478,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -32248,7 +32499,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -32260,43 +32511,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 32 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 13312 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -32308,15 +32559,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32330,7 +32581,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -32339,19 +32590,19 @@ NonTemporalA: 0 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32368,33 +32619,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 134 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32410,26 +32661,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -32438,13 +32689,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -32457,7 +32708,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4fXWWSVAh5Sx3ln-dukIdf9DQU0-ZZbM_XZiPevYyhHg= BufferLoad: true BufferStore: true CUCount: null @@ -32467,9 +32718,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -32488,7 +32739,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -32500,42 +32751,42 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 111616 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 111616 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 13312 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 @@ -32548,15 +32799,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32570,28 +32821,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32608,33 +32859,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 135 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32650,26 +32901,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -32683,9 +32934,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -32697,7 +32948,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT30Yb2FxBYbeBYzDmCEQ-2xgTMHV7OuGTt-uS6NPWiiK4= BufferLoad: true BufferStore: true CUCount: null @@ -32707,10 +32958,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -32738,45 +32989,45 @@ InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 768 + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 46080 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 46080 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 13312 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -32788,15 +33039,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32810,28 +33061,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32848,13 +33099,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 136 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -32864,17 +33115,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32890,7 +33141,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -32900,16 +33151,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -32923,8 +33174,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -32980,7 +33231,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 128 @@ -33088,8 +33339,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 137 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -33220,7 +33471,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -33328,8 +33579,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 138 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -33460,7 +33711,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -33568,8 +33819,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 139 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -33647,27 +33898,27 @@ tailLoopOptB: false - [2, 3, 0, 1] - - - [120, 256, 1, 8192] - - [31, 0.0] + - [133, 15.85] - - [128, 256, 1, 8192] - [0, 0.0] - - [128, 2440, 1, 8192] - - [32, 0.0] + - [120, 54.86] - - [128, 5120, 1, 8192] - - [122, 68.21] + - [121, 68.21] - - [128, 5640, 1, 8192] - - [123, 70.77] + - [122, 70.77] - - [256, 120, 1, 8192] - - [91, 35579.9] + - [87, 35579.9] - - [256, 256, 1, 8192] - - [34, 0.0] + - [30, 0.0] - - [256, 512, 1, 8192] - [1, 0.0] - - [256, 528, 1, 8192] - - [136, 37.1] + - [135, 37.1] - - [256, 2048, 1, 8192] - - [124, 72.57] + - [123, 72.57] - - [512, 120, 1, 8192] - - [92, 54795.5] + - [88, 54795.5] - - [512, 128, 1, 8192] - [2, 0.0] - - [512, 256, 1, 8192] @@ -33675,259 +33926,259 @@ - - [512, 512, 1, 8192] - [4, 0.0] - - [512, 528, 1, 8192] - - [35, 0.0] + - [31, 0.0] - - [512, 1980, 1, 8192] - - [36, 0.0] + - [32, 0.0] - - [512, 2048, 1, 8192] - [5, 0.0] - - [528, 256, 1, 8192] - - [37, 0.0] + - [136, 36.39] - - [1024, 512, 1, 8192] - - [38, 0.0] + - [33, 0.0] - - [1980, 512, 1, 8192] - - [39, 0.0] + - [34, 0.0] - - [2048, 512, 1, 8192] - [6, 0.0] - - [2820, 5640, 1, 8192] - [7, 0.0] - - [3072, 512, 1, 8192] - - [41, 0.0] + - [36, 0.0] - - [3960, 512, 1, 8192] - - [42, 0.0] + - [37, 0.0] - - [4352, 128, 1, 8192] - - [127, 65.33] + - [126, 65.33] - - [4352, 256, 1, 8192] - - [43, 0.0] + - [38, 0.0] - - [4608, 256, 1, 8192] - - [44, 0.0] + - [39, 0.0] - - [5120, 128, 1, 8192] - - [128, 66.51] + - [127, 66.51] - - [5640, 128, 1, 8192] - - [129, 68.8] + - [128, 68.8] - - [5640, 2820, 1, 8192] - [8, 0.0] - - [6912, 128, 1, 8192] - - [130, 71.97] + - [129, 71.97] - - [7296, 128, 1, 8192] - - [45, 0.0] + - [40, 0.0] - - [10880, 128, 1, 8192] - [9, 0.0] - - [4, 128, 8192, 30] - [10, 0.0] - - [16, 128, 8192, 33] - - [120, 6.95] + - [117, 6.95] - - [40, 128, 8192, 64] - - [121, 17.71] + - [118, 17.71] - - [128, 128, 1, 17711] - - [58, 0.0] + - [54, 0.0] - - [128, 960, 1, 17711] - - [126, 52.82] + - [125, 52.82] - - [128, 2480, 1, 17711] - - [125, 67.94] + - [124, 67.94] - - [128, 17711, 1, 41] - - [99, 10.05] + - [96, 10.05] - - [384, 17711, 1, 246] - - [73, 0.0] + - [93, 58.97] - - [384, 17711, 1, 768] - - [97, 80.41] + - [94, 80.41] - - [768, 96, 1, 17711] - - [77, 0.0] + - [72, 0.0] - - [887, 256, 1, 17711] - - [132, 65.02] + - [73, 0.0] - - [928, 128, 1, 17711] - - [133, 50.17] + - [131, 50.17] - - [2732, 384, 1, 17711] - - [131, 93.64] + - [130, 93.64] - - [28, 256, 1, 4096] - - [119, 2.75] + - [116, 2.75] - - [28, 320, 1, 4096] - - [115, 3.3] + - [112, 3.3] - - [64, 512, 1, 4096] - - [113, 10.53] + - [110, 10.53] - - [72, 256, 1, 4096] - - [112, 6.35] + - [46, 0.0] - - [72, 320, 1, 4096] - - [112, 7.69] + - [47, 0.0] - - [80, 512, 1, 4096] - - [107, 12.45] + - [105, 12.45] - - [96, 512, 1, 4096] - - [109, 14.97] + - [107, 14.97] - - [116, 256, 1, 4096] - - [54, 0.0] + - [50, 0.0] - - [116, 320, 1, 4096] - - [55, 0.0] + - [51, 0.0] - - [128, 2048, 1, 4096] - - [100, 46.7] + - [97, 46.7] - - [160, 512, 1, 4096] - - [118, 20.26] + - [115, 20.26] - - [180, 256, 1, 4096] - - [62, 0.0] + - [58, 0.0] - - [180, 320, 1, 4096] - - [63, 0.0] + - [59, 0.0] - - [256, 28, 1, 4096] - - [111, 2.75] + - [109, 2.75] - - [256, 72, 1, 4096] - - [108, 6.24] + - [106, 6.24] - - [256, 116, 1, 4096] - - [116, 9.6] + - [113, 9.6] - - [256, 180, 1, 4096] - - [110, 13.36] + - [108, 13.36] - - [256, 256, 1, 4096] - - [114, 18.23] + - [111, 18.23] - - [256, 7680, 1, 4096] - - [71, 0.0] + - [67, 0.0] - - [512, 160, 1, 4096] - - [117, 19.51] + - [114, 19.51] - - [512, 512, 1, 4096] - [11, 0.0] - - [512, 2246, 1, 4096] - - [75, 0.0] + - [70, 0.0] - - [1600, 128, 1, 4096] - - [104, 36.46] + - [101, 36.46] - - [1824, 2048, 1, 4096] - [12, 0.0] - - [2048, 57, 1, 4096] - - [79, 0.0] + - [75, 0.0] - - [2048, 64, 1, 4096] - - [80, 0.0] + - [76, 0.0] - - [2048, 82, 1, 4096] - - [13, 0.0] + - [103, 28.4] - - [2048, 160, 1, 4096] - - [105, 44.82] + - [102, 44.82] - - [2048, 2048, 1, 4096] - - [14, 0.0] + - [13, 0.0] - - [2246, 512, 1, 4096] - - [15, 0.0] + - [14, 0.0] - - [4132, 256, 1, 4096] - - [81, 0.0] + - [77, 0.0] - - [4132, 512, 1, 4096] - - [82, 0.0] + - [78, 0.0] - - [7680, 256, 1, 4096] - - [16, 0.0] + - [15, 0.0] - - [7680, 512, 1, 4096] - - [17, 0.0] + - [16, 0.0] - - [28, 32, 8192, 28] - - [83, 0.0] + - [79, 0.0] - - [32, 25, 8192, 25] - - [84, 0.0] + - [80, 0.0] - - [32, 64, 4096, 57] - - [85, 0.0] + - [81, 0.0] - - [32, 64, 4096, 82] - - [86, 0.0] + - [82, 0.0] - - [48, 160, 4096, 192] - - [18, 0.0] + - [17, 0.0] - - [48, 160, 4096, 642] - - [19, 0.0] + - [18, 0.0] - - [64, 200, 4096, 32] - - [87, 0.0] + - [83, 0.0] - - [160, 64, 96, 4096] - - [20, 0.0] + - [19, 0.0] - - [200, 64, 4096, 32] - - [21, 0.0] + - [119, 14.33] - - [8, 256, 1, 2048] - - [22, 0.0] + - [20, 0.0] - - [16, 256, 1, 2048] - - [23, 0.0] + - [21, 0.0] - - [32, 256, 1, 2048] - - [24, 0.0] + - [22, 0.0] - - [36, 256, 1, 2048] - - [25, 0.0] + - [23, 0.0] - - [40, 256, 1, 2048] - - [26, 0.0] + - [24, 0.0] - - [48, 256, 1, 2048] - - [27, 0.0] + - [25, 0.0] - - [64, 256, 1, 2048] - - [28, 0.0] + - [26, 0.0] - - [72, 256, 1, 2048] - - [29, 0.0] + - [27, 0.0] - - [80, 256, 1, 2048] - - [30, 0.0] + - [28, 0.0] - - [96, 256, 1, 2048] - - [89, 9584.86] + - [85, 9584.86] - - [128, 256, 1, 2048] - - [59, 0.0] + - [55, 0.0] - - [256, 128, 1, 2048] - - [68, 0.0] + - [64, 0.0] - - [256, 256, 1, 2048] - - [70, 0.0] + - [66, 0.0] - - [64, 128, 1, 8192] - - [134, 5.16] + - [132, 5.16] - - [128, 128, 1, 8192] - - [135, 9.41] + - [134, 9.41] - - [256, 128, 1, 98304] - - [33, 0.0] + - [29, 0.0] - - [1980, 1024, 1, 8192] - - [40, 0.0] + - [35, 0.0] - - [57, 32, 1, 262144] - - [46, 0.0] + - [137, 14.94] - - [64, 64, 1, 102400] - - [47, 0.0] + - [41, 0.0] - - [64, 64, 1, 131072] - - [48, 0.0] + - [42, 0.0] - - [64, 64, 1, 819200] - - [49, 0.0] + - [43, 0.0] - - [64, 128, 1, 1024] - - [106, 1.02] + - [104, 1.02] - - [64, 128, 1, 131072] - - [50, 0.0] + - [44, 0.0] - - [72, 128, 1, 1024] - - [51, 0.0] + - [45, 0.0] - - [82, 32, 1, 262144] - - [52, 0.0] + - [48, 0.0] - - [96, 128, 1, 1024] - - [53, 0.0] + - [49, 0.0] - - [128, 64, 1, 131072] - - [56, 0.0] + - [52, 0.0] - - [128, 128, 1, 1024] - - [57, 0.0] + - [53, 0.0] - - [128, 4096, 1, 1024] - - [94, 36.1] + - [90, 36.1] - - [128, 7456, 1, 1024] - - [93, 49.79] + - [89, 49.79] - - [144, 128, 1, 1024] - - [60, 0.0] + - [56, 0.0] - - [160, 10, 1, 655360] - - [61, 0.0] + - [57, 0.0] - - [192, 48, 1, 655360] - - [64, 0.0] + - [60, 0.0] - - [192, 112, 1, 655360] - - [137, 61.51] + - [138, 61.51] - - [224, 64, 1, 527553] - - [138, 49.28] + - [139, 49.28] - - [224, 64, 1, 752863] - - [139, 52.03] + - [140, 52.03] - - [233, 56, 1, 131072] - - [65, 0.0] + - [61, 0.0] - - [252, 128, 1, 17711] - - [66, 0.0] + - [62, 0.0] - - [256, 128, 1, 1024] - - [67, 0.0] + - [63, 0.0] - - [256, 128, 1, 17711] - - [69, 0.0] + - [65, 0.0] - - [256, 7968, 1, 1024] - - [96, 65.32] + - [92, 65.32] - - [288, 64, 1, 806154] - - [72, 0.0] + - [68, 0.0] - - [512, 128, 1, 1024] - - [74, 0.0] + - [69, 0.0] - - [512, 2011, 1, 1024] - - [95, 52.72] + - [91, 52.72] - - [642, 304, 1, 655360] - - [76, 0.0] + - [71, 0.0] - - [1024, 128, 1, 2048] - - [78, 0.0] + - [74, 0.0] - - [2011, 512, 1, 1024] - - [103, 50.39] + - [100, 50.39] - - [4096, 128, 1, 1024] - - [102, 32.27] + - [99, 32.27] - - [20, 48, 17711, 124] - - [90, 30249.3] + - [86, 30249.3] - - [128, 128, 6, 17711] - - [88, 0.0] + - [84, 0.0] - - [128, 17711, 6, 128] - - [98, 49.6] + - [95, 49.6] - - [7968, 256, 1, 1024] - - [101, 64.89] + - [98, 64.89] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml index 1acb1b7b293..06e33565602 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml @@ -79,248 +79,6 @@ UseScaleAlphaVec: 0 UseScaleCD: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AdaptiveGemm: 1 - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: 1 - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel - GlobalSplitUCoalesced: true - GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 1 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 - NumThreads: 64 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: 0 - enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -375,7 +133,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -484,8 +242,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -527,7 +285,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -617,7 +375,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -726,8 +484,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -769,7 +527,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -858,7 +616,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -967,8 +725,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1010,7 +768,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -1100,7 +858,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 LSCB: 128 @@ -1209,8 +967,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1252,7 +1010,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -1287,6 +1045,248 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT13u0IoyZA0H6JeQsxYx5dKpo7nQ9hoyYgsEHeT0aCVTI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -1826,7 +1826,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 128 @@ -1936,7 +1936,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1978,7 +1978,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -2068,7 +2068,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -2178,7 +2178,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2220,7 +2220,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -2309,7 +2309,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 LSCB: 128 @@ -2419,7 +2419,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2461,7 +2461,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -2791,7 +2791,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -2901,7 +2901,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2943,7 +2943,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -3460,7 +3460,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -3472,7 +3472,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -3483,10 +3482,10 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: 0 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr @@ -3503,7 +3502,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -3515,43 +3514,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26624 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -3564,14 +3563,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3585,28 +3584,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -3625,7 +3624,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -3635,7 +3634,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3648,14 +3647,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -3670,15 +3669,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -3700,8 +3699,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -3714,6 +3713,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= BufferLoad: true BufferStore: true CUCount: null @@ -3723,10 +3723,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -3744,7 +3744,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -3756,43 +3756,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -3805,13 +3805,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -3824,31 +3824,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -3865,17 +3866,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3888,14 +3889,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -3910,23 +3911,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 6 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -3935,13 +3936,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -3954,7 +3955,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -3997,8 +3998,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -4107,7 +4108,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4149,7 +4150,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -4177,8 +4178,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -4205,7 +4206,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -4226,7 +4227,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -4238,39 +4239,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 + LVPB: 16 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -4286,15 +4287,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4306,7 +4307,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -4322,15 +4323,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4348,33 +4348,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4390,26 +4390,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -4437,7 +4437,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= BufferLoad: true BufferStore: true CUCount: null @@ -4480,45 +4480,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -4528,15 +4528,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4556,8 +4556,8 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 @@ -4566,14 +4566,14 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -4590,7 +4590,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4605,18 +4605,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4632,17 +4632,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -4679,7 +4679,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3oeOmeGqaW_jaSFGz_jm6w84YNB03l0N-hq9s6Jgreqc= BufferLoad: true BufferStore: true CUCount: null @@ -4689,7 +4688,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -4710,7 +4709,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -4722,39 +4721,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -4770,15 +4769,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4790,7 +4789,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -4798,22 +4797,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4831,33 +4831,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4873,26 +4873,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -4908,7 +4908,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -4920,7 +4920,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= BufferLoad: true BufferStore: true CUCount: null @@ -4930,10 +4930,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -4951,7 +4951,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -4963,15 +4963,257 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 LVCB: 4 - LVPA: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 LVPB: 16 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 @@ -5071,7 +5313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 + SolutionIndex: 21 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -5203,7 +5445,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -5311,8 +5553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5354,7 +5596,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -5444,7 +5686,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 LSCA: 16 LSCB: 16 @@ -5553,8 +5795,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5596,7 +5838,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -5685,7 +5927,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 LSCB: 16 @@ -5793,8 +6035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5836,7 +6078,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -5926,7 +6168,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -6035,8 +6277,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6078,7 +6320,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -6168,7 +6410,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 LSCA: 32 LSCB: 64 @@ -6277,8 +6519,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6320,7 +6562,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -6410,7 +6652,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 LSCA: 16 LSCB: 16 @@ -6519,8 +6761,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6562,7 +6804,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -6761,7 +7003,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 + SolutionIndex: 28 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7002,7 +7244,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 + SolutionIndex: 29 SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7080,9 +7322,9 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -7092,7 +7334,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Pg_BkawruRCL1XGe64CaI5c_MySTZSQHvn2xslbYUME= BufferLoad: true BufferStore: true CUCount: null @@ -7102,10 +7344,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -7120,10 +7362,10 @@ GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -7135,34 +7377,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -7170,13 +7412,13 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -7184,49 +7426,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7243,32 +7486,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -7285,27 +7529,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -7318,11 +7562,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -7332,7 +7576,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kiD3-1VtWxVcsUpwA9y_XMbufpwlDdmtaKQ75wO2y28= BufferLoad: true BufferStore: true CUCount: null @@ -7342,10 +7586,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -7375,8 +7619,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 LSCA: 128 LSCB: 64 LSPA: 8 @@ -7388,30 +7632,30 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -7423,15 +7667,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 192 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 192 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7445,7 +7689,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -7459,14 +7703,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7483,13 +7727,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -7500,16 +7744,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7525,26 +7769,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -7553,13 +7797,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -7572,7 +7816,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= BufferLoad: true BufferStore: true CUCount: null @@ -7582,7 +7826,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7615,7 +7859,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 128 @@ -7628,21 +7872,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -7652,8 +7896,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] @@ -7694,19 +7938,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7723,13 +7967,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -7781,10 +8025,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -7812,7 +8056,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= BufferLoad: true BufferStore: true CUCount: null @@ -7855,7 +8099,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 64 @@ -7963,8 +8207,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -7993,7 +8237,7 @@ TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -8040,7 +8284,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -8052,7 +8296,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= BufferLoad: true BufferStore: true CUCount: null @@ -8062,10 +8306,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -8095,34 +8339,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -8130,10 +8374,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] @@ -8144,14 +8388,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 192 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8165,7 +8409,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -8174,19 +8418,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8203,13 +8447,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -8227,13 +8471,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -8245,11 +8489,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8261,10 +8505,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -8278,9 +8522,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -8292,7 +8536,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= BufferLoad: true BufferStore: true CUCount: null @@ -8303,9 +8547,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -8335,34 +8579,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 40960 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -8370,8 +8614,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -8384,14 +8628,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8405,7 +8649,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -8419,14 +8663,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8443,8 +8687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8467,13 +8711,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -8485,11 +8729,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8513,16 +8757,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -8532,7 +8776,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= BufferLoad: true BufferStore: true CUCount: null @@ -8543,9 +8787,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -8575,34 +8819,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 40960 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -8610,8 +8854,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -8624,14 +8868,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8645,7 +8889,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -8659,14 +8903,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8683,8 +8927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8707,13 +8951,13 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -8725,11 +8969,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8758,9 +9002,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -8772,6 +9016,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= BufferLoad: true BufferStore: true CUCount: null @@ -8782,9 +9027,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -8802,7 +9047,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -8814,34 +9059,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -8849,8 +9094,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -8863,13 +9108,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -8884,7 +9129,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -8897,14 +9142,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -8922,8 +9167,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8933,7 +9178,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8945,9 +9190,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 48 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -8964,10 +9209,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -8975,7 +9220,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -8997,9 +9242,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -9011,7 +9256,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= BufferLoad: true BufferStore: true CUCount: null @@ -9022,9 +9267,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -9042,7 +9287,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -9054,34 +9299,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -9089,8 +9334,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -9103,13 +9348,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9124,7 +9369,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -9137,14 +9382,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -9162,8 +9407,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9173,7 +9418,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -9185,9 +9430,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 48 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -9204,10 +9449,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -9237,11 +9482,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -9251,7 +9496,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= BufferLoad: true BufferStore: true CUCount: null @@ -9294,34 +9538,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 128 - LSPA: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 49152 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 77824 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 77824 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -9342,15 +9586,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 384 - MacroTileA: 96 - MacroTileB: 384 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9378,13 +9622,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 6 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 3 - NumLoadsB: 12 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -9402,8 +9646,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9418,17 +9662,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 48 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 48 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9448,14 +9692,14 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -9472,16 +9716,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -9491,6 +9735,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= BufferLoad: true BufferStore: true CUCount: null @@ -9501,9 +9746,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -9521,7 +9766,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -9533,34 +9778,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -9568,8 +9813,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -9582,13 +9827,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9603,7 +9848,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -9616,14 +9861,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -9641,8 +9886,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9652,7 +9897,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -9664,9 +9909,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -9683,10 +9928,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -9694,7 +9939,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -9711,16 +9956,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -9730,7 +9975,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= BufferLoad: true BufferStore: true CUCount: null @@ -9773,34 +10018,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 128 - LSPA: 16 + LSPA: 32 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -9821,15 +10066,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9857,13 +10102,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 6 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -9881,8 +10126,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9897,21 +10142,21 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 48 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 48 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -9927,13 +10172,13 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -9958,9 +10203,9 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -9970,7 +10215,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= BufferLoad: true BufferStore: true CUCount: null @@ -9980,10 +10225,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -10013,56 +10258,296 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 32 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 @@ -10121,8 +10606,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10163,7 +10648,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10196,8 +10681,728 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3kn1MecExrukXO0aoF0LQyTOFFiyvmj1lAoIdfg5BGCM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 13312 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -10210,7 +11415,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4fXWWSVAh5Sx3ln-dukIdf9DQU0-ZZbM_XZiPevYyhHg= BufferLoad: true BufferStore: true CUCount: null @@ -10220,9 +11425,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -10241,7 +11446,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -10253,42 +11458,42 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 111616 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 111616 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 13312 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 @@ -10301,15 +11506,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10323,28 +11528,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10361,33 +11566,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10403,26 +11608,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -10436,9 +11641,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -10450,7 +11655,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT30Yb2FxBYbeBYzDmCEQ-2xgTMHV7OuGTt-uS6NPWiiK4= BufferLoad: true BufferStore: true CUCount: null @@ -10460,10 +11665,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -10493,43 +11698,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 768 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 46080 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 46080 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 13312 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -10541,15 +11746,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10563,28 +11768,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10601,13 +11806,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -10617,17 +11822,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10643,7 +11848,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -10653,16 +11858,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -10676,8 +11881,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -10733,7 +11938,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 128 @@ -10841,8 +12046,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10973,7 +12178,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -11081,8 +12286,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11213,7 +12418,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -11321,8 +12526,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11399,16 +12604,16 @@ tailLoopOptA: true tailLoopOptB: false - [2, 3, 0, 1] -- - - [64, 128, 1, 1024] - - [0, 1.02] - - - [128, 7456, 1, 1024] - - [1, 49.79] +- - - [128, 7456, 1, 1024] + - [0, 49.79] - - [128, 4096, 1, 1024] - - [2, 36.1] + - [1, 36.1] - - [512, 2011, 1, 1024] - - [3, 52.72] + - [2, 52.72] - - [256, 7968, 1, 1024] - - [4, 65.32] + - [3, 65.32] + - - [384, 17711, 1, 246] + - [4, 58.97] - - [384, 17711, 1, 768] - [5, 80.41] - - [128, 17711, 6, 128] @@ -11427,74 +12632,82 @@ - [12, 36.46] - - [2048, 160, 1, 4096] - [13, 44.82] + - - [2048, 82, 1, 4096] + - [14, 28.4] + - - [64, 128, 1, 1024] + - [15, 1.02] - - [80, 512, 1, 4096] - - [14, 12.45] + - [16, 12.45] - - [256, 72, 1, 4096] - - [15, 6.24] + - [17, 6.24] - - [96, 512, 1, 4096] - - [16, 14.97] + - [18, 14.97] - - [256, 180, 1, 4096] - - [17, 13.36] + - [19, 13.36] - - [256, 28, 1, 4096] - - [18, 2.75] - - - [72, 320, 1, 4096] - - [19, 7.69] + - [20, 2.75] - - [64, 512, 1, 4096] - - [20, 10.53] + - [21, 10.53] - - [256, 256, 1, 4096] - - [21, 18.23] + - [22, 18.23] - - [28, 320, 1, 4096] - - [22, 3.3] + - [23, 3.3] - - [256, 116, 1, 4096] - - [23, 9.6] + - [24, 9.6] - - [512, 160, 1, 4096] - - [24, 19.51] - - - [72, 256, 1, 4096] - - [19, 6.35] + - [25, 19.51] - - [160, 512, 1, 4096] - - [25, 20.26] + - [26, 20.26] - - [28, 256, 1, 4096] - - [26, 2.75] + - [27, 2.75] - - [16, 128, 8192, 33] - - [27, 6.95] + - [28, 6.95] - - [40, 128, 8192, 64] - - [28, 17.71] + - [29, 17.71] + - - [200, 64, 4096, 32] + - [30, 14.33] + - - [128, 2440, 1, 8192] + - [31, 54.86] - - [128, 5120, 1, 8192] - - [29, 68.21] + - [32, 68.21] - - [128, 5640, 1, 8192] - - [30, 70.77] + - [33, 70.77] - - [256, 2048, 1, 8192] - - [31, 72.57] + - [34, 72.57] - - [128, 2480, 1, 17711] - - [32, 67.94] + - [35, 67.94] - - [128, 960, 1, 17711] - - [33, 52.82] + - [36, 52.82] - - [4352, 128, 1, 8192] - - [34, 65.33] + - [37, 65.33] - - [5120, 128, 1, 8192] - - [35, 66.51] + - [38, 66.51] - - [5640, 128, 1, 8192] - - [36, 68.8] + - [39, 68.8] - - [6912, 128, 1, 8192] - - [37, 71.97] + - [40, 71.97] - - [2732, 384, 1, 17711] - - [38, 93.64] - - - [887, 256, 1, 17711] - - [39, 65.02] + - [41, 93.64] - - [928, 128, 1, 17711] - - [40, 50.17] + - [42, 50.17] - - [64, 128, 1, 8192] - - [41, 5.16] + - [43, 5.16] + - - [120, 256, 1, 8192] + - [44, 15.85] - - [128, 128, 1, 8192] - - [42, 9.41] + - [45, 9.41] - - [256, 528, 1, 8192] - - [43, 37.1] + - [46, 37.1] + - - [528, 256, 1, 8192] + - [47, 36.39] + - - [57, 32, 1, 262144] + - [48, 14.94] - - [192, 112, 1, 655360] - - [44, 61.51] + - [49, 61.51] - - [224, 64, 1, 527553] - - [45, 49.28] + - [50, 49.28] - - [224, 64, 1, 752863] - - [46, 52.03] + - [51, 52.03] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 21facc71fb2..c44ee7e3e4b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -78,244 +78,7 @@ UseScaleAB: '' UseScaleAlphaVec: 1 UseScaleCD: false -- - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AdaptiveGemm: 0 - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3I0wGX9OCQytvVFG093CjdC3Fh11WJS7aHB6fbVAR3uo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 - LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 +- - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -477,7 +240,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 1 + SolutionIndex: 0 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -714,7 +477,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 2 + SolutionIndex: 1 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -951,7 +714,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 3 + SolutionIndex: 2 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -1188,7 +951,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 4 + SolutionIndex: 3 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -1425,7 +1188,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 5 + SolutionIndex: 4 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -1662,7 +1425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 + SolutionIndex: 5 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -1749,20 +1512,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x17euTeWN6jMQ7FVMh1Ll2e-mPEhVuyHHM5E6MJDowG_k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -1779,7 +1542,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1791,45 +1554,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -1840,14 +1603,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1861,28 +1624,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 1 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1891,7 +1654,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1899,22 +1662,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -1922,16 +1685,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1943,33 +1706,33 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -1986,20 +1749,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x19rPAmQ_VqFXH145RBghSm4KBn8pkGE6YhVYK32za2qc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2016,7 +1779,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2028,45 +1791,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2077,14 +1840,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2098,28 +1861,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 2 + NonTemporalB: 1 NonTemporalC: 6 - NonTemporalD: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2128,7 +1891,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2136,18 +1899,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -2159,16 +1922,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2180,29 +1943,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2211,7 +1974,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2223,20 +1986,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10ZIxmIfFOLgg0guqZLCypFSHKQGFlah6vEK6DSCcuO0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2247,7 +2010,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2255,7 +2018,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2265,43 +2028,43 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 64 - LSPA: 64 + LSPA: 8 LSPB: 16 - LVCA: 4 + LVCA: 32 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31744 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31744 - LdsOffsetMetadata_Blk: 46080 - LdsPadA: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -2313,14 +2076,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 48 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -2335,27 +2098,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 2 NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalC: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2373,33 +2136,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2418,15 +2181,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -2442,13 +2205,13 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2460,7 +2223,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= BufferLoad: true BufferStore: true CUCount: null @@ -2484,7 +2247,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2492,7 +2255,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2502,36 +2265,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 1 + LSPA: 4 LSPB: 32 - LVCA: 256 + LVCA: 64 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -2539,35 +2302,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 256 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -2578,21 +2341,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 0 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -2602,7 +2365,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2610,8 +2373,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -2619,24 +2382,24 @@ StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2655,14 +2418,14 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 + WorkGroupMapping: 1 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -2679,8 +2442,8 @@ _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -2697,7 +2460,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= BufferLoad: true BufferStore: true CUCount: null @@ -2739,7 +2502,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 256 LSCB: 32 @@ -2816,9 +2579,9 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -2847,22 +2610,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -2900,7 +2663,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -2913,7 +2676,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2924,7 +2687,7 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -2934,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT144x256x32_MI16zGLdUFppgmxwRtFeiiI5CTCUno1nocQnuy352ZtGY3g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= BufferLoad: true BufferStore: true CUCount: null @@ -2944,7 +2707,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2957,9 +2720,10 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -2973,48 +2737,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 LSCA: 16 - LSCB: 32 + LSCB: 128 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 128 + LVPB: 2 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 60416 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 8 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3024,15 +2788,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [9, 4] - MIWaveTileA: 9 - MIWaveTileB: 4 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 144 - MacroTile1: 256 - MacroTileA: 144 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3053,21 +2817,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 18 - NumLoadsB: 32 - NumLoadsCoalescedA: 9 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3076,7 +2840,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3084,84 +2848,86 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 36 - ThreadTile1: 4 - ThreadTileA: 36 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -3171,20 +2937,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3194,64 +2960,65 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3261,15 +3028,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3283,28 +3050,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3313,7 +3080,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3321,8 +3088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3331,59 +3098,60 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -3392,13 +3160,14 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -3408,7 +3177,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= BufferLoad: true BufferStore: true CUCount: null @@ -3431,6 +3200,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 @@ -3447,11 +3217,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 128 LSCB: 32 LSPA: 4 @@ -3461,25 +3231,25 @@ LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -3487,21 +3257,21 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 256 @@ -3512,10 +3282,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3526,14 +3296,14 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 10 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 @@ -3550,7 +3320,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3558,33 +3328,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 4 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3594,24 +3364,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3620,22 +3391,23 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -3645,7 +3417,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= BufferLoad: true BufferStore: true CUCount: null @@ -3668,8 +3440,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -3677,44 +3450,44 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 64 LSCB: 32 - LSPA: 1 + LSPA: 16 LSPB: 32 - LVCA: 256 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3735,14 +3508,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 256 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -3763,21 +3536,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 1 NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -3795,33 +3568,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3831,7 +3604,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -3846,9 +3620,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3861,15 +3635,16 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -3882,20 +3657,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3907,15 +3682,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -3925,47 +3700,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60416 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 60416 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3973,15 +3748,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3995,28 +3770,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4033,33 +3808,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4078,25 +3853,25 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -4108,8 +3883,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -4122,20 +3897,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xMybmPilfmjdxVF3hhgbtCdONGixNfiKnsBpv98ldbF8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4147,15 +3922,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4165,98 +3940,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4265,7 +4040,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4273,32 +4048,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -4315,41 +4090,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -4362,20 +4137,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1zEU2zKTgrAqpo4O1jNbgKO5gG6NuTgAwAILYGhhS0vw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4387,13 +4162,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -4405,48 +4180,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4454,48 +4229,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -4513,39 +4288,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 64 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 64 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4558,29 +4333,29 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -4588,8 +4363,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -4602,7 +4377,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= BufferLoad: true BufferStore: true CUCount: null @@ -4625,9 +4400,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -4635,7 +4410,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4645,36 +4420,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 32 - LSPA: 4 + LSPA: 1 LSPB: 32 - LVCA: 64 + LVCA: 256 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4682,35 +4457,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4722,21 +4497,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 6 + NonTemporalB: 7 + NonTemporalC: 7 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 16 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 32 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4745,7 +4520,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4753,22 +4528,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4776,10 +4551,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4795,19 +4570,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4816,16 +4591,16 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -4842,7 +4617,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= BufferLoad: true BufferStore: true CUCount: null @@ -4865,9 +4640,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -4875,7 +4650,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4885,36 +4660,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 32 - LSPA: 4 + LSPA: 1 LSPB: 32 - LVCA: 64 + LVCA: 256 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4922,11 +4697,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4934,23 +4709,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4961,22 +4736,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 4 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 16 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 32 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4985,7 +4760,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4993,33 +4768,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5035,19 +4810,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5056,21 +4831,21 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -5082,7 +4857,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAu1FYXTFu65OW_QnFxWNJ9o3fDj1dpn7VH7-NENuD30= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= BufferLoad: true BufferStore: true CUCount: null @@ -5092,22 +4867,22 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -5115,7 +4890,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -5125,47 +4900,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 69120 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 69120 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 69120 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5173,15 +4948,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5195,28 +4970,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5224,8 +4999,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5233,39 +5008,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5285,16 +5060,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5308,8 +5083,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -5322,17 +5097,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5348,12 +5123,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -5365,35 +5140,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 64 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPB: 2 + LVCA: 4 + LVCB: 32 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 13824 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 13824 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 + LdsOffsetMetadata: 13824 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -5402,10 +5177,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5413,15 +5188,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5441,23 +5216,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5465,7 +5240,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5473,9 +5248,9 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -5483,29 +5258,29 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5515,28 +5290,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5550,7 +5325,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -5562,7 +5337,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= BufferLoad: true BufferStore: true CUCount: null @@ -5572,10 +5347,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5587,15 +5362,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -5605,98 +5380,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 35840 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 35840 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 5 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5713,33 +5488,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5758,8 +5533,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5771,25 +5546,25 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -5802,7 +5577,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= BufferLoad: true BufferStore: true CUCount: null @@ -5812,7 +5587,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5833,7 +5608,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -5845,36 +5620,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5882,35 +5657,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5921,22 +5696,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5945,7 +5720,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5953,18 +5728,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -5976,16 +5751,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5995,26 +5770,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -6023,8 +5798,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6042,20 +5817,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6067,13 +5842,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -6085,97 +5860,97 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 98816 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 98816 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -6193,39 +5968,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6235,32 +6010,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6268,8 +6043,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -6282,20 +6057,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6308,12 +6083,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6325,45 +6100,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -6373,15 +6148,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6395,28 +6170,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 7 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 4 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6425,7 +6200,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6433,39 +6208,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6478,23 +6253,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6508,9 +6283,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -6522,7 +6297,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= BufferLoad: true BufferStore: true CUCount: null @@ -6542,12 +6317,12 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -6555,7 +6330,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -6565,36 +6340,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 1 + LSPA: 8 LSPB: 32 - LVCA: 256 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6602,11 +6377,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6614,23 +6389,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6641,21 +6416,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 2 + NonTemporalB: 3 NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -6664,8 +6439,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6673,33 +6448,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6719,15 +6494,15 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6736,15 +6511,15 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6762,17 +6537,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -6782,12 +6557,12 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -6795,7 +6570,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -6805,34 +6580,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 16 - LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69120 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 69120 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 144896 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69120 - LdsOffsetMetadata_Blk: 144896 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 78336 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6842,10 +6617,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -6854,14 +6629,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6883,20 +6658,20 @@ NonTemporal: -1 NonTemporalA: 1 NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6904,8 +6679,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6913,14 +6688,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -6936,10 +6711,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6955,11 +6730,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -6971,16 +6746,16 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -6990,7 +6765,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -7002,17 +6777,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -7028,7 +6803,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -7045,34 +6820,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 2 - LVCA: 4 - LVCB: 32 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 + LVPB: 8 + LdsBlockSizePerPadA: 3072 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13824 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 13824 - LdsNumElementsAlignedA: 5120 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 25088 LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25088 + LdsOffsetB_Blk: 90624 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 90624 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -7082,10 +6857,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7093,15 +6868,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7121,23 +6896,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 6 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7153,34 +6928,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 @@ -7195,42 +6970,42 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -7242,7 +7017,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= BufferLoad: true BufferStore: true CUCount: null @@ -7252,10 +7027,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7267,7 +7042,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -7275,7 +7050,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -7285,34 +7060,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35840 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -7320,10 +7095,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -7333,15 +7108,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7355,28 +7130,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 2 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7393,13 +7168,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -7409,17 +7184,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7435,41 +7210,41 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -7482,7 +7257,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= BufferLoad: true BufferStore: true CUCount: null @@ -7505,9 +7280,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -7515,7 +7290,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -7525,34 +7300,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7565,7 +7340,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7573,15 +7348,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7601,22 +7376,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 1 NonTemporalB: 7 NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7633,33 +7408,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7675,19 +7450,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7696,7 +7471,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7705,7 +7480,7 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -7722,7 +7497,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= BufferLoad: true BufferStore: true CUCount: null @@ -7732,7 +7507,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7765,72 +7540,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 16 - LSCB: 256 - LSPA: 64 - LSPB: 4 - LVCA: 4 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 49280 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 49280 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7841,22 +7616,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7873,32 +7648,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -7915,7 +7690,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -7925,16 +7700,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -7943,14 +7718,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -7962,20 +7737,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7987,15 +7762,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -8005,98 +7780,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8113,33 +7888,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8165,16 +7940,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -8183,13 +7958,13 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -8202,7 +7977,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= BufferLoad: true BufferStore: true CUCount: null @@ -8233,7 +8008,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -8245,7 +8020,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 32 @@ -8255,25 +8030,25 @@ LVCB: 8 LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -8293,10 +8068,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 128 @@ -8321,16 +8096,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 16 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -8353,39 +8128,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -8398,14 +8173,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 @@ -8442,7 +8217,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= BufferLoad: true BufferStore: true CUCount: null @@ -8467,7 +8242,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -8475,7 +8250,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -8485,35 +8260,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 16 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 2560 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 47616 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 47616 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 20992 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 86528 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 47616 - LdsOffsetMetadata_Blk: 78336 - LdsPadA: 8 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 86528 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -8533,15 +8308,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 2] MIWaveTile: [5, 4] MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 256 - MacroTileA: 80 - MacroTileB: 256 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8562,21 +8337,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumLoadsA: 5 + NumLoadsB: 4 NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8593,26 +8368,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -8620,12 +8395,12 @@ ThreadTile1: 4 ThreadTileA: 20 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -8635,7 +8410,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -8645,9 +8420,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8656,11 +8431,11 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -8670,7 +8445,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -8682,7 +8457,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= BufferLoad: true BufferStore: true CUCount: null @@ -8702,7 +8477,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -8725,34 +8500,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 25088 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25088 - LdsOffsetB_Blk: 90624 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 90624 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -8773,15 +8548,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 64 - MacroTileA: 192 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8801,31 +8576,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalA: 6 + NonTemporalB: 4 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8833,33 +8608,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 4 - ThreadTileA: 12 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8875,19 +8650,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8896,21 +8671,21 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -8922,7 +8697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x192x32_MI16idrImgWgCO7nvcJQBWTDXiMZm2WoqSF8tVNvZiYJ_9I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= BufferLoad: true BufferStore: true CUCount: null @@ -8945,7 +8720,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8965,34 +8740,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 3072 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 8704 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -9005,7 +8780,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9013,15 +8788,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9041,23 +8816,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 7 NonTemporalB: 2 NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9065,7 +8840,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9073,8 +8848,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -9083,23 +8858,23 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9115,7 +8890,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -9125,9 +8900,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9145,12 +8920,12 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -9162,20 +8937,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9187,15 +8962,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -9205,98 +8980,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 14848 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 14848 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 10240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9313,39 +9088,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -9355,28 +9130,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9388,9 +9163,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -9402,7 +9177,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= BufferLoad: true BufferStore: true CUCount: null @@ -9425,7 +9200,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -9433,7 +9208,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9445,35 +9220,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 32 LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 1 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 19456 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 19456 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 36864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 36864 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -9485,7 +9260,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9493,15 +9268,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9521,23 +9296,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9553,39 +9328,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -9598,17 +9373,17 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] _DepthU: 32 @@ -9616,7 +9391,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9625,7 +9400,7 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -9642,7 +9417,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= BufferLoad: true BufferStore: true CUCount: null @@ -9654,7 +9429,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false @@ -9667,7 +9442,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -9675,7 +9450,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -9685,98 +9460,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 768 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 64768 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 64768 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 6656 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 + NonTemporalA: 0 + NonTemporalB: 5 NonTemporalC: 6 NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9785,7 +9560,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9793,14 +9568,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -9816,10 +9591,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9835,7 +9610,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -9845,9 +9620,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9856,19 +9631,19 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false @@ -9882,7 +9657,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= BufferLoad: true BufferStore: true CUCount: null @@ -9902,12 +9677,12 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -9915,7 +9690,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -9925,35 +9700,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 62976 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 78336 - LdsPadA: 16 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -9965,7 +9740,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9973,15 +9748,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 5] + MIWaveGroup: [1, 4] + MIWaveTile: [3, 6] MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 160 - MacroTileA: 96 - MacroTileB: 160 + MacroTile0: 48 + MacroTile1: 384 + MacroTileA: 48 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10001,22 +9776,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 3 - NumLoadsB: 5 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 12 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10024,7 +9799,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -10033,8 +9808,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -10043,29 +9818,29 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 12 - ThreadTile1: 5 + ThreadTile1: 6 ThreadTileA: 12 - ThreadTileB: 5 - TransposeLDS: 1 + ThreadTileB: 6 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10075,19 +9850,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10096,7 +9871,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10110,9 +9885,9 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -10122,27 +9897,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -10153,7 +9928,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -10165,24 +9940,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 98560 LdsInitCVgprs: false - LdsNumBytes: 33792 + LdsNumBytes: 98560 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -10191,66 +9966,66 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 + LdsOffsetMetadata: 16384 LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -10258,14 +10033,16 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10273,33 +10050,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10313,46 +10090,49 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -10362,17 +10142,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10382,7 +10162,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -10405,36 +10185,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 2560 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 20992 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20992 - LdsOffsetB_Blk: 86528 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 86528 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10442,11 +10222,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -10454,23 +10234,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10481,31 +10261,33 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 5 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10513,33 +10295,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10553,28 +10335,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -10592,7 +10377,7 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -10602,7 +10387,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= BufferLoad: true BufferStore: true CUCount: null @@ -10612,7 +10397,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10622,7 +10407,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -10633,7 +10418,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -10645,36 +10430,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 68096 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10682,35 +10467,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10721,31 +10506,33 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10753,39 +10540,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10793,38 +10580,41 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -10832,7 +10622,7 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -10842,17 +10632,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10867,7 +10657,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -10885,35 +10675,35 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8704 + LdsBytesNoAmax: 76288 LdsInitCVgprs: false - LdsNumBytes: 8704 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 20480 - LdsPadA: 0 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -10922,8 +10712,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10933,15 +10723,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 256 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10961,23 +10751,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10993,39 +10785,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11033,48 +10825,51 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11082,141 +10877,141 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 8 LSPB: 32 LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14848 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 14848 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11233,19 +11028,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -11255,17 +11050,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11279,42 +11075,42 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11322,142 +11118,143 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 19456 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 36864 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 36864 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -11465,7 +11262,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11473,33 +11270,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11515,7 +11313,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -11525,36 +11323,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11562,7 +11360,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= BufferLoad: true BufferStore: true CUCount: null @@ -11572,131 +11369,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 768 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64768 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 64768 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 6656 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11705,7 +11503,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11713,33 +11511,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11758,43 +11557,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11802,141 +11601,142 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 16 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62976 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 62976 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 73216 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62976 - LdsOffsetMetadata_Blk: 73216 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 6] - MIWaveTileA: 3 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 384 - MacroTileA: 48 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11944,8 +11744,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11953,19 +11753,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -11975,17 +11775,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 6 - ThreadTileA: 12 - ThreadTileB: 6 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11998,43 +11799,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12042,7 +11843,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= BufferLoad: true BufferStore: true CUCount: null @@ -12052,27 +11852,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -12082,37 +11882,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 98560 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12120,12 +11920,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12134,14 +11934,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12153,33 +11953,32 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12195,37 +11994,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12235,51 +12035,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12287,7 +12084,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= BufferLoad: true BufferStore: true CUCount: null @@ -12297,28 +12094,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12327,37 +12124,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12366,11 +12163,11 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12378,15 +12175,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12398,7 +12195,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -12408,30 +12205,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -12440,33 +12236,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12480,51 +12277,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12532,37 +12326,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false @@ -12572,50 +12366,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 68096 + LdsBytesNoAmax: 148736 LdsInitCVgprs: false - LdsNumBytes: 68096 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 148736 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 41600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 74368 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 107136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 68096 - LdsOffsetMetadata_Blk: 164352 - LdsPadA: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 107136 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12623,15 +12417,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12643,9 +12437,9 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -12653,23 +12447,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 10 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12685,39 +12478,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 64 - ThreadTile1: 1 + ThreadTile1: 5 ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12725,12 +12519,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -12740,36 +12531,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12777,37 +12568,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32pZ9lLvhCdmH7RpWQsDwDXdHIV2y4SsFgkBGs2DezxyQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -12817,37 +12608,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 130560 LdsInitCVgprs: false - LdsNumBytes: 66560 + LdsNumBytes: 130560 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12857,10 +12648,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12868,15 +12659,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12888,7 +12679,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -12898,23 +12689,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 7 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12930,33 +12720,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 7 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12970,51 +12761,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13022,7 +12810,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= BufferLoad: true BufferStore: true CUCount: null @@ -13032,28 +12820,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -13062,104 +12850,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 256 MacroTile1: 256 - MacroTileA: 32 + MacroTileA: 256 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13175,19 +12962,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -13197,17 +12984,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 64 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 64 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13215,46 +13003,43 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -13267,7 +13052,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= BufferLoad: true BufferStore: true CUCount: null @@ -13279,7 +13063,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -13310,8 +13094,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 64 LSCB: 32 LSPA: 16 @@ -13323,21 +13107,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -13345,7 +13129,7 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 @@ -13359,14 +13143,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13380,7 +13164,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -13393,15 +13177,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 12 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -13419,8 +13203,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -13428,7 +13212,7 @@ StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 @@ -13443,10 +13227,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13462,7 +13246,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -13473,7 +13257,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -13490,12 +13274,12 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -13509,6 +13293,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= BufferLoad: true BufferStore: true CUCount: null @@ -13518,10 +13303,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -13551,43 +13336,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 124416 LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 124416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -13599,15 +13384,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13621,7 +13406,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -13630,19 +13415,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -13660,13 +13445,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -13676,22 +13461,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13713,16 +13498,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 8 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -13731,16 +13516,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -13750,7 +13535,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= BufferLoad: true BufferStore: true CUCount: null @@ -13762,7 +13547,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -13793,34 +13578,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -13828,7 +13613,7 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 @@ -13841,15 +13626,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13863,7 +13648,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -13876,15 +13661,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -13902,8 +13687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -13911,25 +13696,25 @@ StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13945,17 +13730,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -13978,7 +13763,7 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -13992,6 +13777,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2SSMrbopPykWoZTFVNLV97OiwO0KcDliSRSdRIxQZbhY= BufferLoad: true BufferStore: true CUCount: null @@ -14022,7 +13808,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -14034,34 +13820,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123904 - LdsInitCVgprs: false - LdsNumBytes: 123904 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 33792 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121344 + LdsInitCVgprs: false + LdsNumBytes: 121344 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -14082,15 +13868,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14117,15 +13903,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -14143,8 +13929,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14154,27 +13940,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14189,15 +13975,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -14214,8 +14000,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14233,7 +14019,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= BufferLoad: true BufferStore: true CUCount: null @@ -14276,8 +14062,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 32 LSPA: 16 @@ -14355,7 +14141,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -14385,8 +14171,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14428,7 +14214,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -14456,14 +14242,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -14475,7 +14261,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= BufferLoad: true BufferStore: true CUCount: null @@ -14486,9 +14272,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -14506,7 +14292,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -14518,34 +14304,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148736 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 148736 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 41600 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74368 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 107136 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 107136 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -14553,8 +14339,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -14567,14 +14353,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 320 - MacroTileA: 256 - MacroTileB: 320 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14588,7 +14374,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -14602,14 +14388,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 320 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 8 - NumLoadsB: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -14627,8 +14413,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14636,9 +14422,9 @@ StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -14651,14 +14437,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 5 - ThreadTileA: 64 - ThreadTileB: 5 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14673,8 +14459,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -14698,13 +14484,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -14717,7 +14503,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= BufferLoad: true BufferStore: true CUCount: null @@ -14748,7 +14533,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -14760,34 +14545,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 130560 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 130560 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32256 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -14808,15 +14593,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] - MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 224 - MacroTileA: 256 - MacroTileB: 224 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14838,20 +14623,20 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 112 - NumLoadsA: 8 - NumLoadsB: 7 - NumLoadsCoalescedA: 1 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -14869,8 +14654,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14880,27 +14665,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14915,15 +14700,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -14940,8 +14725,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14959,7 +14744,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= BufferLoad: true BufferStore: true CUCount: null @@ -14969,7 +14754,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14990,7 +14775,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -15002,72 +14787,72 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 132096 + LdsNumBytes: 115200 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 + LdsOffsetA_Blk: 65536 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -15081,19 +14866,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -15111,34 +14896,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15154,26 +14939,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -15201,6 +14986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= BufferLoad: true BufferStore: true CUCount: null @@ -15210,9 +14996,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -15231,7 +15017,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -15243,77 +15029,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148992 + LdsBytesNoAmax: 53376 LdsInitCVgprs: false - LdsNumBytes: 148992 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 49920 + LdsNumBytes: 53376 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74496 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 99072 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 99072 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 384 - MacroTileA: 192 - MacroTileB: 384 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -15326,15 +15112,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -15352,34 +15138,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15395,26 +15181,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -15428,7 +15214,7 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -15442,7 +15228,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= BufferLoad: true BufferStore: true CUCount: null @@ -15452,7 +15238,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15473,7 +15259,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -15485,36 +15271,36 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124416 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 124416 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -15523,34 +15309,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 6] - MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -15569,16 +15355,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15594,34 +15380,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15640,23 +15426,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -15674,7 +15460,7 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -15684,7 +15470,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= BufferLoad: true BufferStore: true CUCount: null @@ -15694,9 +15480,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -15715,7 +15501,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -15727,77 +15513,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPB: 8 + LVCA: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LVPB: 2 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148992 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 148992 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 49920 + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74496 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 99072 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 99072 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 384 - MacroTileA: 192 - MacroTileB: 384 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -15810,17 +15596,17 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15836,34 +15622,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15879,26 +15665,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -15912,7 +15698,7 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -15926,7 +15712,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= BufferLoad: true BufferStore: true CUCount: null @@ -15936,9 +15722,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -15957,7 +15743,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -15969,77 +15755,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148992 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 148992 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 49920 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74496 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 99072 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 99072 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 384 - MacroTileA: 192 - MacroTileB: 384 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -16048,19 +15834,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -16078,34 +15864,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16121,26 +15907,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -16149,14 +15935,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -16168,7 +15954,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT12-NO75n14OGnRrcE2zJwZ3cI80SyGufNoJCdW7NONHc= BufferLoad: true BufferStore: true CUCount: null @@ -16178,10 +15964,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -16211,77 +15997,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 106752 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 106752 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [4, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -16295,14 +16081,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -16320,13 +16106,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -16336,22 +16122,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16363,7 +16149,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -16379,10 +16165,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -16396,11 +16182,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -16410,6 +16196,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= BufferLoad: true BufferStore: true CUCount: null @@ -16419,10 +16206,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -16440,7 +16227,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -16452,77 +16239,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123904 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 123904 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -16531,19 +16318,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -16561,38 +16348,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 48 + ThreadTileA: 8 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -16604,10 +16391,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -16615,15 +16402,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -16637,8 +16424,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -16651,7 +16438,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= BufferLoad: true BufferStore: true CUCount: null @@ -16661,7 +16448,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -16694,72 +16481,72 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -16778,13 +16565,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 @@ -16803,13 +16590,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -16819,18 +16606,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16846,26 +16633,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -16881,9 +16668,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -16893,7 +16680,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= BufferLoad: true BufferStore: true CUCount: null @@ -16904,9 +16691,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -16924,7 +16711,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -16936,34 +16723,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53376 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 53376 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -16971,8 +16758,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -16985,14 +16772,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17006,7 +16793,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -17020,14 +16807,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -17045,8 +16832,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -17056,7 +16843,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -17069,14 +16856,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17091,8 +16878,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -17121,11 +16908,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -17135,7 +16922,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= BufferLoad: true BufferStore: true CUCount: null @@ -17146,9 +16933,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -17166,7 +16953,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -17178,43 +16965,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -17226,15 +17013,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17248,7 +17035,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -17257,21 +17044,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17287,8 +17074,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -17298,27 +17085,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17330,17 +17117,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -17358,16 +17145,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -17377,7 +17164,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= BufferLoad: true BufferStore: true CUCount: null @@ -17408,7 +17194,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -17420,34 +17206,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28800 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 28800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17468,15 +17254,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17504,16 +17290,16 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17529,8 +17315,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -17540,23 +17326,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17572,18 +17358,18 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -17619,7 +17405,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= BufferLoad: true BufferStore: true CUCount: null @@ -17650,7 +17436,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -17662,34 +17448,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17711,14 +17497,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17741,19 +17527,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -17771,8 +17557,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -17780,9 +17566,9 @@ StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -17795,10 +17581,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17814,11 +17600,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -17842,14 +17628,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -17861,7 +17647,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= BufferLoad: true BufferStore: true CUCount: null @@ -17872,9 +17658,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -17904,7 +17690,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -17917,9 +17703,9 @@ LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 115200 + LdsNumBytes: 49664 LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 @@ -17930,7 +17716,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 + LdsOffsetMetadata: 49664 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 @@ -17939,8 +17725,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -17974,7 +17760,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -17983,7 +17769,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -18013,8 +17799,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -18089,8 +17875,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -18103,7 +17889,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= BufferLoad: true BufferStore: true CUCount: null @@ -18134,7 +17920,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -18146,34 +17932,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 61632 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 61632 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -18194,15 +17980,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18230,14 +18016,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -18255,8 +18041,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -18266,23 +18052,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18298,17 +18084,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -18326,8 +18112,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -18335,7 +18121,7 @@ tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -18345,7 +18131,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= BufferLoad: true BufferStore: true CUCount: null @@ -18388,8 +18174,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 128 LSCB: 64 LSPA: 8 @@ -18399,13 +18185,13 @@ LVPA: 2 LVPB: 4 LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 + LdsBytesNoAmax: 55808 LdsInitCVgprs: false - LdsNumBytes: 41472 + LdsNumBytes: 55808 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -18414,7 +18200,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 + LdsOffsetMetadata: 55808 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 @@ -18437,14 +18223,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 2] + MIWaveTile: [2, 5] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 80 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18472,14 +18258,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 5 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -18497,8 +18283,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -18522,13 +18308,13 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 5 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18544,7 +18330,7 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -18568,16 +18354,16 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -18587,7 +18373,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= BufferLoad: true BufferStore: true CUCount: null @@ -18597,10 +18383,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -18630,77 +18416,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -18714,14 +18500,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -18739,13 +18525,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -18755,18 +18541,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18786,22 +18572,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -18815,11 +18601,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -18829,6 +18615,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= BufferLoad: true BufferStore: true CUCount: null @@ -18838,7 +18625,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -18871,36 +18658,36 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 + LdsBytesNoAmax: 140416 LdsInitCVgprs: false - LdsNumBytes: 115200 + LdsNumBytes: 140416 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 37440 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 70208 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB_Blk: 102976 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata_Blk: 102976 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -18909,10 +18696,10 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -18920,23 +18707,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveTile: [2, 9] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -18955,14 +18742,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 9 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -18980,34 +18767,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19023,26 +18810,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -19051,14 +18838,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -19070,7 +18857,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= BufferLoad: true BufferStore: true CUCount: null @@ -19080,10 +18867,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -19113,48 +18900,48 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 133120 + LdsNumBytes: 66560 LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 + LdsOffsetA_Blk: 131072 LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -19166,24 +18953,24 @@ MIWaveTileA: 2 MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -19192,13 +18979,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 128 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -19222,33 +19009,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 32 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 32 ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true @@ -19275,16 +19062,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -19298,11 +19085,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -19312,7 +19099,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= BufferLoad: true BufferStore: true CUCount: null @@ -19355,8 +19141,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 0 LSCA: 128 LSCB: 64 LSPA: 8 @@ -19365,12 +19151,12 @@ LVCB: 16 LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 49664 + LdsNumBytes: 65536 LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 @@ -19384,29 +19170,29 @@ LdsOffsetMetadata: 49664 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 MacroTile1: 64 @@ -19417,10 +19203,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -19464,8 +19250,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -19480,22 +19266,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19511,14 +19297,14 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -19535,14 +19321,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -19554,7 +19340,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= BufferLoad: true BufferStore: true CUCount: null @@ -19564,10 +19350,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -19597,77 +19383,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61632 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 61632 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 12480 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -19680,15 +19466,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 3 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -19706,13 +19492,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -19722,18 +19508,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19759,16 +19545,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -19782,9 +19568,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -19796,7 +19582,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= BufferLoad: true BufferStore: true CUCount: null @@ -19806,10 +19592,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -19827,7 +19613,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19839,77 +19625,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 55808 + LdsNumBytes: 132096 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 66048 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55808 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -19923,14 +19709,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -19948,34 +19734,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19991,10 +19777,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -20007,10 +19793,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -20024,9 +19810,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -20038,7 +19824,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= BufferLoad: true BufferStore: true CUCount: null @@ -20049,9 +19835,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -20069,7 +19855,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -20081,34 +19867,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -20116,8 +19902,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -20129,15 +19915,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20151,7 +19937,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -20165,14 +19951,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -20190,8 +19976,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -20201,23 +19987,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20233,17 +20019,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -20266,8 +20052,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -20280,7 +20066,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= BufferLoad: true BufferStore: true CUCount: null @@ -20311,7 +20097,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -20323,34 +20109,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 140416 + LdsBytesNoAmax: 57472 LdsInitCVgprs: false - LdsNumBytes: 140416 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 37440 + LdsNumBytes: 57472 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 70208 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 102976 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 102976 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -20372,14 +20158,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 9] - MIWaveTileA: 2 - MIWaveTileB: 9 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 288 - MacroTileA: 256 - MacroTileB: 288 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20407,14 +20193,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 8 - NumLoadsB: 9 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -20432,8 +20218,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -20441,9 +20227,9 @@ StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -20456,10 +20242,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 9 - ThreadTileA: 32 - ThreadTileB: 9 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20475,11 +20261,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -20510,7 +20296,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -20522,7 +20308,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= BufferLoad: true BufferStore: true CUCount: null @@ -20532,10 +20318,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -20565,77 +20351,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -20649,14 +20435,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 128 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -20674,13 +20460,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -20698,10 +20484,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20717,26 +20503,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -20745,13 +20531,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -20764,6 +20550,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= BufferLoad: true BufferStore: true CUCount: null @@ -20773,7 +20560,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -20806,39 +20593,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49664 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -20854,15 +20641,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20892,11 +20679,11 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 @@ -20915,13 +20702,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -20931,22 +20718,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -20962,22 +20749,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -21005,7 +20792,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= BufferLoad: true BufferStore: true CUCount: null @@ -21015,7 +20802,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -21036,7 +20823,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21048,36 +20835,36 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31744 + LdsBytesNoAmax: 83968 LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31744 - LdsOffsetMetadata_Blk: 46080 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -21086,10 +20873,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -21097,23 +20884,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -21129,17 +20916,17 @@ NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -21157,34 +20944,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21203,23 +20990,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -21247,7 +21034,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= BufferLoad: true BufferStore: true CUCount: null @@ -21257,9 +21044,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -21278,7 +21065,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21290,34 +21077,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -21325,10 +21112,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] @@ -21338,15 +21125,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21360,7 +21147,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -21373,15 +21160,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -21399,34 +21186,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21442,26 +21229,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -21475,9 +21262,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -21489,7 +21276,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4adP3l0wOsgdIOylpn3az32jBDr5TLNASEte2YeMUxfU= BufferLoad: true BufferStore: true CUCount: null @@ -21499,10 +21286,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -21532,77 +21319,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53312 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 53312 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4160 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -21615,15 +21402,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -21641,13 +21428,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -21657,17 +21444,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -21694,16 +21481,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -21712,13 +21499,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -21731,7 +21518,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= BufferLoad: true BufferStore: true CUCount: null @@ -21743,7 +21530,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -21774,34 +21561,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57472 + LdsBytesNoAmax: 61696 LdsInitCVgprs: false - LdsNumBytes: 57472 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -21809,7 +21596,7 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 @@ -21822,15 +21609,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21844,7 +21631,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -21857,15 +21644,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -21883,8 +21670,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -21899,18 +21686,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 48 + ThreadTile1: 1 + ThreadTileA: 48 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21926,17 +21713,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -21954,12 +21741,12 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -21973,7 +21760,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= BufferLoad: true BufferStore: true CUCount: null @@ -21983,7 +21770,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -22016,39 +21803,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -22064,15 +21851,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22100,14 +21887,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22125,16 +21912,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 @@ -22142,17 +21929,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22168,26 +21955,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22203,7 +21990,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -22215,7 +22002,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= BufferLoad: true BufferStore: true CUCount: null @@ -22225,10 +22012,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -22246,7 +22033,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -22258,77 +22045,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 18432 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -22337,19 +22124,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22367,34 +22154,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22410,26 +22197,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22438,14 +22225,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -22457,7 +22244,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= BufferLoad: true BufferStore: true CUCount: null @@ -22467,10 +22254,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -22488,7 +22275,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -22500,77 +22287,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 83968 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 83968 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 147456 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 83968 - LdsOffsetMetadata_Blk: 147456 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 512 - MacroTileA: 128 - MacroTileB: 512 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -22579,21 +22366,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 16 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22609,34 +22396,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22652,26 +22439,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22685,8 +22472,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -22699,7 +22486,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= BufferLoad: true BufferStore: true CUCount: null @@ -22709,9 +22496,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -22730,7 +22517,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -22742,77 +22529,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148992 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 148992 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 49920 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74496 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 99072 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 99072 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 384 - MacroTileA: 192 - MacroTileB: 384 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -22821,19 +22608,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -22851,34 +22638,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22894,26 +22681,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -22922,12 +22709,12 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false @@ -22941,7 +22728,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= BufferLoad: true BufferStore: true CUCount: null @@ -22953,7 +22740,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -22984,34 +22771,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61696 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 61696 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 45056 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 45056 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -23019,7 +22806,7 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 @@ -23032,15 +22819,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23054,7 +22841,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -23067,15 +22854,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -23093,8 +22880,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -23109,17 +22896,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 48 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -23136,7 +22923,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -23146,7 +22933,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -23164,14 +22951,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -23183,7 +22970,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= BufferLoad: true BufferStore: true CUCount: null @@ -23193,10 +22979,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -23226,77 +23012,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -23310,13 +23096,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 @@ -23335,34 +23121,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23382,22 +23168,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23406,13 +23192,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -23425,7 +23211,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6kJPO6d_JImjSWiXy_pK7JIg8RH_UP-eyBc2sQYIVtBk= BufferLoad: true BufferStore: true CUCount: null @@ -23468,7 +23254,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -23547,7 +23333,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -23577,8 +23363,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -23620,7 +23406,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -23667,7 +23453,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= BufferLoad: true BufferStore: true CUCount: null @@ -23677,7 +23463,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -23710,39 +23496,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 32 - LSCB: 64 - LSPA: 16 + LSCB: 128 + LSPA: 32 LSPB: 8 LVCA: 8 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28800 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 28800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -23778,7 +23564,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23794,16 +23580,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumMbskPrefetchElements: 16 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23819,13 +23604,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -23862,7 +23647,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -23872,16 +23657,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -23909,7 +23694,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= BufferLoad: true BufferStore: true CUCount: null @@ -23940,7 +23725,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -23952,72 +23737,72 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -24031,19 +23816,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -24061,8 +23846,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24072,22 +23857,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -24104,17 +23889,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -24132,8 +23917,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24151,7 +23936,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= BufferLoad: true BufferStore: true CUCount: null @@ -24161,7 +23946,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -24176,7 +23961,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -24184,7 +23969,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -24194,36 +23979,36 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53312 + LdsBytesNoAmax: 24704 LdsInitCVgprs: false - LdsNumBytes: 53312 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4160 + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -24232,34 +24017,34 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -24273,21 +24058,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24303,13 +24088,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -24319,17 +24104,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -24346,7 +24131,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -24356,26 +24141,26 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24393,6 +24178,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1db7_bixCXfVFtS0qxdSmo0ipeanArAwXytvCrGrHgPA= BufferLoad: true BufferStore: true CUCount: null @@ -24402,7 +24188,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -24417,15 +24203,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -24435,75 +24221,75 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 73728 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -24514,20 +24300,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumMbskPrefetchElements: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24544,34 +24329,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24590,25 +24375,25 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -24622,7 +24407,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -24634,7 +24419,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= BufferLoad: true BufferStore: true CUCount: null @@ -24644,10 +24429,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -24659,7 +24444,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -24667,7 +24452,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -24677,43 +24462,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -24725,14 +24510,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -24747,7 +24532,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24761,14 +24546,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24785,13 +24570,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -24801,9 +24586,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -24838,18 +24623,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -24861,9 +24646,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -24875,7 +24660,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= BufferLoad: true BufferStore: true CUCount: null @@ -24886,9 +24671,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -24906,7 +24691,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -24918,97 +24703,97 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 64 - LSPA: 8 + LSPA: 64 LSPB: 16 - LVCA: 32 + LVCA: 4 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 23552 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 NumThreads: 256 @@ -25027,8 +24812,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25038,23 +24823,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25073,14 +24858,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -25098,14 +24883,14 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -25117,7 +24902,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT42RnkXZHyflklHCbcqmB86XH77wK5L-QkDV3ZxLjGKKw= BufferLoad: true BufferStore: true CUCount: null @@ -25128,9 +24913,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -25142,7 +24927,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -25150,7 +24935,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -25160,43 +24945,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 16 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 LVCB: 16 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24704 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 24704 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 - LdsPadA: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -25208,15 +24993,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25230,30 +25015,30 @@ MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumMbskPrefetchElements: 16 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25269,8 +25054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25286,21 +25071,21 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25322,7 +25107,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -25333,20 +25118,20 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -25359,7 +25144,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= BufferLoad: true BufferStore: true CUCount: null @@ -25369,7 +25154,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -25384,15 +25169,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -25402,39 +25187,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 256 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 4 + LSPB: 16 LVCA: 16 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 37376 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 37376 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37376 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -25450,15 +25235,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25470,7 +25255,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25478,22 +25263,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25510,38 +25296,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25556,25 +25342,25 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -25600,7 +25386,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= BufferLoad: true BufferStore: true CUCount: null @@ -25643,34 +25429,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 32 LSCB: 64 - LSPA: 64 + LSPA: 32 LSPB: 16 - LVCA: 4 + LVCA: 8 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 3584 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23552 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 23552 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 37888 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 189440 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23552 - LdsOffsetMetadata_Blk: 37888 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 189440 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -25691,15 +25477,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25726,15 +25512,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -25752,8 +25538,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25768,22 +25554,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 28 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 28 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25805,7 +25591,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -25830,7 +25616,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -25842,7 +25628,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= BufferLoad: true BufferStore: true CUCount: null @@ -25867,15 +25653,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -25885,35 +25671,35 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 16 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 4 + LSPB: 1 LVCA: 16 - LVCB: 16 + LVCB: 64 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25600 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -25933,15 +25719,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25961,24 +25747,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumMbskPrefetchElements: 16 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25994,8 +25780,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -26005,27 +25791,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26040,14 +25826,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -26058,7 +25844,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -26084,7 +25870,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= BufferLoad: true BufferStore: true CUCount: null @@ -26094,7 +25880,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -26109,15 +25895,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -26127,39 +25913,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 3584 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 1 + LVCA: 16 + LVCB: 256 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 58368 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 58368 - LdsOffsetB_Blk: 189440 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 189440 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -26175,15 +25961,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 1] - MIWaveTileA: 7 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 32 - MacroTileA: 224 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26195,7 +25981,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -26203,23 +25989,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalA: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 7 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 2 - NumLoadsCoalescedA: 7 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumMbskPrefetchElements: 16 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26236,13 +26021,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -26252,17 +26037,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 28 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 28 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -26289,26 +26074,26 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: 0 - enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26316,7 +26101,7 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -26326,7 +26111,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= BufferLoad: true BufferStore: true CUCount: null @@ -26337,9 +26121,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -26351,15 +26135,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -26369,77 +26153,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 - LSCA: 16 + LSCA: 128 LSCB: 64 - LSPA: 4 - LSPB: 1 - LVCA: 16 - LVCB: 64 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -26453,16 +26237,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumMbskPrefetchElements: 16 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26478,8 +26261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -26489,23 +26272,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26521,18 +26303,18 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -26541,8 +26323,8 @@ _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -26554,11 +26336,11 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -26568,7 +26350,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= BufferLoad: true BufferStore: true CUCount: null @@ -26578,10 +26360,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -26593,15 +26375,15 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 - GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -26611,48 +26393,48 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 16 - LSCB: 256 + LSCA: 64 + LSCB: 128 LSPA: 16 - LSPB: 1 + LSPB: 8 LVCA: 16 - LVCB: 256 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 256 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 37376 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 37376 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37376 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -26660,49 +26442,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26719,38 +26501,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26762,28 +26543,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -26795,8 +26576,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -26818,7 +26599,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -26851,39 +26632,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 106880 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 106880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24960 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -26899,15 +26680,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26935,14 +26716,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26959,13 +26740,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -26976,16 +26757,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27001,26 +26782,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] - WorkGroupMapping: 6 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -27029,8 +26810,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27048,7 +26829,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= BufferLoad: true BufferStore: true CUCount: null @@ -27058,7 +26838,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -27091,39 +26871,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 132096 + LdsNumBytes: 114944 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 + LdsOffsetA_Blk: 65536 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -27139,15 +26919,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27175,14 +26955,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27199,33 +26979,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27241,26 +27021,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -27278,7 +27058,7 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -27288,6 +27068,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= BufferLoad: true BufferStore: true CUCount: null @@ -27297,7 +27078,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -27313,12 +27094,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27330,36 +27111,36 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 1 + LVCA: 4 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 - LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -27368,34 +27149,34 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -27406,7 +27187,7 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 @@ -27414,15 +27195,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27438,33 +27219,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27483,23 +27264,23 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 6 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -27527,7 +27308,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3wPOxEyiXQz96xeSKClb2IOrvTQ46X8sHYWHz5Dqg9Lc= BufferLoad: true BufferStore: true CUCount: null @@ -27537,7 +27318,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -27570,39 +27351,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 16 - LSCB: 64 - LSPA: 16 - LSPB: 1 - LVCA: 4 - LVCB: 64 - LVPA: 4 - LVPB: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 2 + LVCA: 8 + LVCB: 128 + LVPA: 8 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -27618,14 +27399,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -27647,22 +27428,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27678,13 +27459,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -27694,9 +27475,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -27730,16 +27511,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -27757,263 +27538,263 @@ tailLoopOptB: false - [2, 3, 0, 1] - - - [4, 30, 8192, 128] - - [29, 0.0] + - [21, 0.0] - - [16, 33, 8192, 128] - - [106, 8.99] + - [102, 8.99] - - [40, 61, 8192, 128] - - [0, 0.0] + - [103, 18.28] - - [128, 17711, 1, 960] - - [82, 58.48] + - [75, 58.48] - - [128, 17711, 1, 2480] - - [1, 0.0] + - [0, 0.0] - - [252, 17711, 1, 128] - - [81, 39.0] + - [74, 39.0] - - [256, 17711, 1, 128] - - [2, 0.0] + - [1, 0.0] - - [384, 246, 1, 17711] - - [36, 0.0] + - [28, 0.0] - - [384, 768, 1, 17711] - - [37, 0.0] + - [110, 73.58] - - [928, 17711, 1, 128] - - [65, 59.92] + - [56, 59.92] - - [2732, 17711, 1, 384] - - [64, 90.61] + - [55, 90.61] - - [6, 128, 17711, 41] - - [3, 0.0] + - [2, 0.0] - - [20, 124, 17711, 48] - - [4, 0.0] + - [3, 0.0] - - [41, 6, 17711, 128] - - [5, 0.0] + - [4, 0.0] - - [256, 256, 41, 17711] - - [6, 0.0] + - [5, 0.0] - - [1, 1, 1, 4096] - - [109, 0.0] + - [106, 0.0] - - [1, 4096, 1, 256] - - [105, 0.18] + - [101, 0.18] - - [1, 4096, 1, 512] - - [105, 0.3] + - [101, 0.3] - - [28, 4096, 1, 256] - - [102, 4.54] + - [97, 4.54] - - [28, 4096, 1, 320] - - [98, 5.04] + - [92, 5.04] - - [57, 262144, 1, 32] - - [30, 0.0] + - [22, 0.0] - - [64, 102400, 1, 64] - - [83, 24.71] + - [76, 24.71] - - [64, 131072, 1, 64] - - [31, 0.0] + - [23, 0.0] - - [64, 131072, 1, 128] - - [101, 36.73] + - [95, 36.73] - - [64, 819200, 1, 64] - - [83, 29.19] + - [76, 29.19] - - [72, 4096, 1, 256] - - [99, 9.21] + - [93, 9.21] - - [72, 4096, 1, 320] - - [97, 10.63] + - [91, 10.63] - - [82, 262144, 1, 32] - - [95, 16.68] + - [89, 16.68] - - [116, 4096, 1, 256] - - [7, 0.0] + - [96, 14.16] - - [116, 4096, 1, 320] - - [8, 0.0] + - [65, 16.26] - - [128, 4096, 1, 2048] - - [96, 44.23] + - [90, 44.23] - - [128, 131072, 1, 64] - - [34, 0.0] + - [26, 0.0] - - [160, 655360, 1, 10] - - [66, 7.73] + - [58, 7.73] - - [180, 4096, 1, 256] - - [9, 0.0] + - [88, 19.1] - - [180, 4096, 1, 320] - - [87, 21.73] + - [80, 21.73] - - [192, 655360, 1, 48] - - [94, 34.07] + - [87, 34.07] - - [192, 655360, 1, 112] - - [59, 52.18] + - [50, 52.18] - - [224, 527553, 1, 64] - - [84, 39.78] + - [77, 39.78] - - [224, 752863, 1, 64] - - [85, 41.01] + - [78, 41.01] - - [256, 1, 1, 4096] - - [110, 0.1] + - [107, 0.1] - - [256, 4096, 1, 28] - - [91, 4.17] + - [84, 4.17] - - [256, 4096, 1, 72] - - [100, 9.58] + - [94, 9.58] - - [256, 4096, 1, 116] - - [89, 13.24] + - [82, 13.24] - - [256, 4096, 1, 180] - - [89, 17.83] + - [82, 17.83] - - [256, 4096, 1, 256] - - [88, 26.1] + - [81, 26.1] - - [256, 4096, 1, 7680] - - [113, 86.69] + - [111, 86.69] - - [288, 806154, 1, 64] - - [93, 40.27] + - [86, 40.27] - - [512, 1, 1, 4096] - - [110, 0.2] + - [107, 0.2] - - [512, 4096, 1, 1] - - [92, 0.27] + - [85, 0.27] - - [512, 4096, 1, 160] - - [90, 31.19] + - [83, 31.19] - - [512, 4096, 1, 512] - - [86, 54.79] + - [79, 54.79] - - [512, 4096, 1, 2246] - - [103, 78.03] + - [98, 78.03] - - [512, 4096, 1, 9216] - - [39, 0.0] + - [30, 0.0] - - [512, 4096, 1, 30816] - - [10, 0.0] + - [6, 0.0] - - [1600, 4096, 1, 128] - - [67, 52.68] + - [59, 52.68] - - [1824, 4096, 1, 2048] - - [41, 0.0] + - [32, 0.0] - - [2048, 4096, 1, 57] - - [67, 29.92] + - [59, 29.92] - - [2048, 4096, 1, 64] - - [52, 59310.0] + - [43, 59310.0] - - [2048, 4096, 1, 82] - - [42, 0.0] + - [33, 0.0] - - [2048, 4096, 1, 160] - - [67, 64.51] + - [59, 64.51] - - [2048, 4096, 1, 2048] - - [11, 0.0] + - [7, 0.0] - - [2246, 4096, 1, 2048] - - [12, 0.0] + - [57, 106.06] - - [2560, 4096, 1, 4096] - - [43, 0.0] + - [34, 0.0] - - [2624, 4096, 1, 2048] - - [68, 115.68] + - [60, 115.68] - - [25, 25, 8192, 32] - - [44, 0.0] + - [35, 0.0] - - [32, 25, 8192, 25] - - [45, 0.0] + - [36, 0.0] - - [32, 57, 4096, 64] - - [46, 0.0] + - [37, 0.0] - - [32, 82, 4096, 64] - - [47, 0.0] + - [38, 0.0] - - [48, 192, 4096, 160] - - [48, 0.0] + - [39, 0.0] - - [48, 642, 4096, 160] - - [49, 0.0] + - [40, 0.0] - - [64, 32, 4096, 200] - - [107, 22.3] + - [104, 22.3] - - [200, 32, 4096, 64] - - [108, 15.84] + - [105, 15.84] - - [256, 2048, 1, 128] - - [13, 0.0] + - [8, 0.0] - - [512, 2048, 1, 14336] - - [14, 0.0] + - [9, 0.0] - - [1024, 2048, 1, 128] - - [40, 0.0] + - [31, 0.0] - - [1024, 2048, 1, 14336] - - [15, 0.0] + - [10, 0.0] - - [1, 8192, 1, 128] - - [16, 0.0] + - [11, 0.0] - - [1, 8192, 1, 256] - - [17, 0.0] + - [100, 0.3] - - [120, 8192, 1, 256] - - [50, 52872.0] + - [41, 52872.0] - - [128, 1, 1, 8192] - - [114, 0.1] + - [112, 0.1] - - [128, 8192, 1, 256] - - [18, 0.0] + - [66, 25.59] - - [128, 8192, 1, 2440] - - [74, 53.83] + - [67, 53.83] - - [128, 8192, 1, 5120] - - [19, 0.0] + - [12, 0.0] - - [128, 8192, 1, 5640] - - [20, 0.0] + - [13, 0.0] - - [256, 1, 1, 8192] - - [21, 0.0] + - [113, 0.19] - - [256, 8192, 1, 512] - - [77, 54.43] + - [70, 54.43] - - [256, 8192, 1, 528] - - [78, 50.73] + - [71, 50.73] - - [256, 8192, 1, 2048] - - [79, 71.81] + - [72, 71.81] - - [256, 98304, 1, 128] - - [22, 0.0] + - [14, 0.0] - - [512, 8192, 1, 120] - - [80, 35.22] + - [73, 35.22] - - [512, 8192, 1, 512] - - [23, 0.0] + - [15, 0.0] - - [512, 8192, 1, 528] - - [24, 0.0] + - [16, 0.0] - - [512, 8192, 1, 1980] - - [25, 0.0] + - [17, 0.0] - - [512, 8192, 1, 2048] - - [26, 0.0] + - [18, 0.0] - - [512, 8192, 1, 3072] - - [27, 0.0] + - [19, 0.0] - - [528, 8192, 1, 256] - - [28, 0.0] + - [20, 0.0] - - [10880, 8192, 1, 128] - - [63, 65.6] + - [54, 65.6] - - [1, 1024, 1, 128] - - [104, 0.02] + - [99, 0.02] - - [1, 4096, 1, 1] - - [104, 0.0] + - [99, 0.0] - - [128, 1, 1, 1024] - - [32, 0.0] + - [24, 0.0] - - [128, 41, 1, 17711] - - [33, 0.0] + - [25, 0.0] - - [128, 1024, 1, 128] - - [72, 2.92] + - [64, 2.92] - - [128, 1024, 1, 4096] - - [69, 28.91] + - [61, 28.91] - - [128, 1024, 1, 7456] - - [112, 39.35] + - [109, 39.35] - - [128, 17711, 1, 128] - - [82, 25.45] + - [75, 25.45] - - [233, 131072, 1, 56] - - [35, 0.0] + - [27, 0.0] - - [256, 1024, 1, 128] - - [70, 5.57] + - [62, 5.57] - - [512, 1024, 1, 128] - - [73, 9.94] + - [65, 9.94] - - [512, 1024, 1, 2011] - - [38, 0.0] + - [29, 0.0] - - [4096, 1024, 1, 128] - - [55, 44.33] + - [46, 44.33] - - [32, 233, 1024, 128] - - [54, 53858.6] + - [44, 53858.6] - - [256, 8192, 1, 256] - - [77, 40.67] + - [70, 40.67] - - [512, 8192, 1, 256] - - [51, 122522.0] + - [42, 122522.0] - - [1024, 8192, 1, 512] - - [57, 95.36] + - [48, 95.36] - - [2011, 1024, 1, 512] - - [56, 55.01] + - [47, 55.01] - - [7968, 1024, 1, 256] - - [53, 135836.0] + - [45, 76.41] - - [3072, 8192, 1, 512] - - [58, 103.91] + - [49, 103.91] - - [4352, 8192, 1, 256] - - [59, 87.22] + - [50, 87.22] - - [4608, 8192, 1, 256] - - [60, 86.41] + - [51, 86.41] - - [5120, 8192, 1, 128] - - [61, 63.17] + - [52, 63.17] - - [5640, 8192, 1, 128] - - [62, 58.59] + - [53, 58.59] - - [7296, 8192, 1, 128] - - [60, 65.76] + - [51, 65.76] - - [4132, 4096, 1, 256] - - [66, 77.55] + - [58, 77.55] - - [4132, 4096, 1, 512] - - [66, 98.72] + - [58, 98.72] - - [128, 1024, 1, 1] - - [71, 0.03] + - [63, 0.03] - - [256, 8192, 1, 1] - - [75, 0.28] + - [68, 0.28] - - [256, 8192, 1, 120] - - [76, 22.65] + - [69, 22.65] - - [256, 4096, 1, 1] - - [91, 0.15] + - [84, 0.15] - - [256, 1024, 1, 7968] - - [111, 59.32] + - [108, 59.32] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml index 1fdae462c24..f5e8e378f9e 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml @@ -90,7 +90,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= BufferLoad: true BufferStore: true CUCount: null @@ -100,7 +99,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -133,72 +132,72 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 115200 LdsInitCVgprs: false LdsNumBytes: 115200 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -217,14 +216,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -243,12 +242,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -258,18 +257,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -285,26 +284,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -313,8 +312,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -375,7 +374,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 32 @@ -485,7 +484,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -527,7 +526,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -616,7 +615,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -726,7 +725,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -768,7 +767,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -858,7 +857,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 32 @@ -968,7 +967,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1010,7 +1009,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -1341,7 +1340,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -1451,7 +1450,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1493,7 +1492,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -1583,7 +1582,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 256 LSCB: 32 @@ -1693,7 +1692,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1735,7 +1734,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -2067,7 +2066,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 256 LSCB: 32 @@ -2177,7 +2176,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2219,7 +2218,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -2308,7 +2307,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -2418,7 +2417,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2460,7 +2459,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -2792,7 +2791,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 32 @@ -2902,7 +2901,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2944,7 +2943,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -2991,7 +2990,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2SSMrbopPykWoZTFVNLV97OiwO0KcDliSRSdRIxQZbhY= BufferLoad: true BufferStore: true CUCount: null @@ -3002,9 +3001,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -3034,34 +3033,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 148992 + LdsBytesNoAmax: 121344 LdsInitCVgprs: false - LdsNumBytes: 148992 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 49920 + LdsNumBytes: 121344 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 74496 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 99072 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 99072 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -3070,7 +3069,7 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -3082,15 +3081,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 384 - MacroTileA: 192 - MacroTileB: 384 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3113,19 +3112,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 288 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -3144,7 +3143,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -3152,25 +3151,25 @@ StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 3 - ThreadTileA: 96 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3196,7 +3195,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -3220,8 +3219,8 @@ reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -3233,7 +3232,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= BufferLoad: true BufferStore: true CUCount: null @@ -3244,9 +3243,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -3276,34 +3275,276 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -3385,7 +3626,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 + SolutionIndex: 14 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -3626,7 +3867,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 + SolutionIndex: 15 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -3716,7 +3957,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= BufferLoad: true BufferStore: true CUCount: null @@ -3726,7 +3967,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -3747,7 +3988,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -3759,14 +4000,256 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 @@ -3868,8 +4351,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -3911,7 +4394,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -4110,7 +4593,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 + SolutionIndex: 18 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -4243,7 +4726,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -4352,8 +4835,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4395,7 +4878,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -4485,7 +4968,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -4594,8 +5077,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4637,7 +5120,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -4674,7 +5157,249 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT12-NO75n14OGnRrcE2zJwZ3cI80SyGufNoJCdW7NONHc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 106752 + LdsInitCVgprs: false + LdsNumBytes: 106752 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 @@ -4727,7 +5452,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -4836,8 +5561,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4879,7 +5604,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -4969,7 +5694,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -5078,8 +5803,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5121,7 +5846,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -5320,7 +6045,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 + SolutionIndex: 24 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -5453,7 +6178,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -5562,8 +6287,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5605,7 +6330,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -5694,7 +6419,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -5803,8 +6528,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5846,7 +6571,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -5936,7 +6661,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -6045,8 +6770,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6088,7 +6813,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -6287,7 +7012,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 + SolutionIndex: 28 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -6420,7 +7145,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 LSCA: 128 LSCB: 32 @@ -6529,8 +7254,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6572,7 +7297,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -6771,7 +7496,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 + SolutionIndex: 30 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7013,7 +7738,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 + SolutionIndex: 31 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7146,7 +7871,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 LSCA: 256 LSCB: 32 @@ -7255,8 +7980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -7298,7 +8023,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -7497,7 +8222,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 + SolutionIndex: 33 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7738,7 +8463,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 + SolutionIndex: 34 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -7980,7 +8705,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 + SolutionIndex: 35 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -8113,7 +8838,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 64 LSCB: 128 @@ -8222,8 +8947,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8265,7 +8990,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -8355,7 +9080,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 LSCA: 128 LSCB: 32 @@ -8464,8 +9189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8507,7 +9232,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -8597,7 +9322,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 LSCA: 128 LSCB: 32 @@ -8706,8 +9431,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8749,7 +9474,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -8839,7 +9564,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -8948,8 +9673,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8991,7 +9716,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -9190,7 +9915,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 + SolutionIndex: 40 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -9432,7 +10157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 + SolutionIndex: 41 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -9565,7 +10290,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 32 @@ -9674,8 +10399,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9717,7 +10442,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -9752,7 +10477,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -9764,7 +10489,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4adP3l0wOsgdIOylpn3az32jBDr5TLNASEte2YeMUxfU= BufferLoad: true BufferStore: true CUCount: null @@ -9774,10 +10499,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true + DepthU: 64 + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -9807,48 +10532,48 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61696 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 61696 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 45056 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 45056 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9860,19 +10585,19 @@ MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 @@ -9891,8 +10616,8 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 6 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 3 NumLoadsB: 4 NumLoadsCoalescedA: 3 @@ -9916,13 +10641,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9932,17 +10657,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 48 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9969,16 +10694,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 @@ -9987,13 +10712,13 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: false + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -10006,7 +10731,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= BufferLoad: true BufferStore: true CUCount: null @@ -10016,9 +10741,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: 0 DirectToVgprB: 0 @@ -10037,7 +10762,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -10049,77 +10774,77 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 61696 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -10132,16 +10857,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumMbskPrefetchElements: 16 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10158,8 +10883,250 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 1 + ThreadTileA: 48 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10201,7 +11168,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -10291,7 +11258,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -10400,8 +11367,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10443,7 +11410,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10533,7 +11500,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 64 @@ -10642,8 +11609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10685,7 +11652,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10775,7 +11742,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -10884,8 +11851,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10927,7 +11894,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11017,7 +11984,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 LSCA: 128 LSCB: 32 @@ -11126,8 +12093,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11169,7 +12136,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11367,7 +12334,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 + SolutionIndex: 50 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -11445,6 +12412,248 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6kJPO6d_JImjSWiXy_pK7JIg8RH_UP-eyBc2sQYIVtBk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false @@ -11500,7 +12709,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 32 LSCB: 128 @@ -11608,8 +12817,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11651,7 +12860,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -11741,7 +12950,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -11803,15 +13012,499 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24704 + LdsInitCVgprs: false + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1db7_bixCXfVFtS0qxdSmo0ipeanArAwXytvCrGrHgPA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27648 + LdsInitCVgprs: false + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -11820,20 +13513,19 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11850,18 +13542,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11874,10 +13566,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11896,25 +13588,25 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11926,9 +13618,9 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -11940,7 +13632,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= BufferLoad: true BufferStore: true CUCount: null @@ -11950,10 +13642,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: 0 DirectToVgprB: 0 DirectToVgprSparseMetadata: false @@ -11983,43 +13675,43 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 LSCA: 16 - LSCB: 64 - LSPA: 4 + LSCB: 256 + LSPA: 16 LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 64 + LVPA: 16 LVPB: 1 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24704 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 24704 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 - LdsPadA: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false @@ -12051,9 +13743,9 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 1 + MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -12067,16 +13759,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumMbskPrefetchElements: 16 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12092,13 +13783,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -12145,16 +13836,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -12168,8 +13859,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -12182,7 +13873,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= BufferLoad: true BufferStore: true CUCount: null @@ -12192,7 +13883,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -12207,7 +13898,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -12215,7 +13906,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -12225,39 +13916,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 16 - LSCB: 256 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 - LVPB: 1 + LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 37376 + LdsBytesNoAmax: 23552 LdsInitCVgprs: false - LdsNumBytes: 37376 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 37376 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -12273,15 +13964,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12293,7 +13984,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -12302,21 +13993,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12333,13 +14025,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -12350,9 +14042,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -12386,26 +14078,26 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12423,7 +14115,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT42RnkXZHyflklHCbcqmB86XH77wK5L-QkDV3ZxLjGKKw= BufferLoad: true BufferStore: true CUCount: null @@ -12466,7 +14158,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 16 LSCB: 64 @@ -12476,24 +14168,24 @@ LVCB: 16 LVPA: 16 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 768 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23552 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 23552 - LdsNumElementsAlignedA: 5120 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 37888 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23552 - LdsOffsetMetadata_Blk: 37888 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -12515,13 +14207,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 48 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 48 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12548,13 +14240,13 @@ NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 @@ -12575,8 +14267,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -12599,14 +14291,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12817,7 +14509,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 + SolutionIndex: 59 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -13059,7 +14751,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 + SolutionIndex: 60 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -13301,7 +14993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 + SolutionIndex: 61 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -13542,7 +15234,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 + SolutionIndex: 62 SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -13674,7 +15366,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 LSCA: 128 LSCB: 64 @@ -13782,8 +15474,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -13824,7 +15516,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -13914,7 +15606,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 LSCA: 64 LSCB: 128 @@ -14022,8 +15714,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14064,26 +15756,265 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 106880 + LdsInitCVgprs: false + LdsNumBytes: 106880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24960 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -14092,8 +16023,8 @@ _staggerStrideShift: 0 enableGLTrA: 0 enableGLTrB: 0 - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14153,7 +16084,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 LSCA: 256 LSCB: 32 @@ -14261,8 +16192,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14393,7 +16324,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 LSCA: 16 LSCB: 64 @@ -14501,8 +16432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14578,9 +16509,249 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3wPOxEyiXQz96xeSKClb2IOrvTQ46X8sHYWHz5Dqg9Lc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 2 + LVCA: 8 + LVCB: 128 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] -- - - [128, 1024, 1, 4096] - - [0, 28.91] +- - - [7968, 1024, 1, 256] + - [0, 76.41] - - [4096, 1024, 1, 128] - [1, 44.33] - - [2011, 1024, 1, 512] @@ -14605,128 +16776,148 @@ - [10, 90.61] - - [928, 17711, 1, 128] - [11, 59.92] + - - [2246, 4096, 1, 2048] + - [12, 106.06] - - [4132, 4096, 1, 256] - - [12, 77.55] + - [13, 77.55] - - [2048, 4096, 1, 160] - - [13, 64.51] + - [14, 64.51] - - [4132, 4096, 1, 512] - - [12, 98.72] + - [13, 98.72] - - [2624, 4096, 1, 2048] - - [14, 115.68] + - [15, 115.68] - - [2048, 4096, 1, 57] - - [13, 29.92] + - [14, 29.92] - - [1600, 4096, 1, 128] - - [13, 52.68] + - [14, 52.68] + - - [128, 1024, 1, 4096] + - [16, 28.91] - - [256, 1024, 1, 128] - - [15, 5.57] + - [17, 5.57] - - [128, 1024, 1, 1] - - [16, 0.03] + - [18, 0.03] - - [128, 1024, 1, 128] - - [17, 2.92] + - [19, 2.92] - - [512, 1024, 1, 128] - - [18, 9.94] + - [20, 9.94] + - - [128, 8192, 1, 256] + - [21, 25.59] - - [128, 8192, 1, 2440] - - [19, 53.83] + - [22, 53.83] - - [256, 8192, 1, 1] - - [20, 0.28] + - [23, 0.28] - - [256, 8192, 1, 120] - - [21, 22.65] + - [24, 22.65] - - [256, 8192, 1, 256] - - [22, 40.67] + - [25, 40.67] - - [256, 8192, 1, 512] - - [22, 54.43] + - [25, 54.43] - - [256, 8192, 1, 528] - - [23, 50.73] + - [26, 50.73] - - [256, 8192, 1, 2048] - - [24, 71.81] + - [27, 71.81] - - [512, 8192, 1, 120] - - [25, 35.22] + - [28, 35.22] - - [252, 17711, 1, 128] - - [26, 39.0] + - [29, 39.0] - - [128, 17711, 1, 128] - - [27, 25.45] + - [30, 25.45] - - [128, 17711, 1, 960] - - [27, 58.48] + - [30, 58.48] - - [64, 819200, 1, 64] - - [28, 29.19] + - [31, 29.19] - - [224, 527553, 1, 64] - - [29, 39.78] + - [32, 39.78] - - [224, 752863, 1, 64] - - [30, 41.01] + - [33, 41.01] - - [512, 4096, 1, 512] - - [31, 54.79] + - [34, 54.79] - - [180, 4096, 1, 320] - - [32, 21.73] + - [35, 21.73] - - [256, 4096, 1, 256] - - [33, 26.1] + - [36, 26.1] - - [256, 4096, 1, 180] - - [34, 17.83] + - [37, 17.83] - - [512, 4096, 1, 160] - - [35, 31.19] + - [38, 31.19] - - [256, 4096, 1, 116] - - [34, 13.24] + - [37, 13.24] - - [256, 4096, 1, 28] - - [36, 4.17] + - [39, 4.17] - - [512, 4096, 1, 1] - - [37, 0.27] + - [40, 0.27] - - [256, 4096, 1, 1] - - [36, 0.15] + - [39, 0.15] - - [192, 655360, 1, 112] - [5, 52.18] - - [288, 806154, 1, 64] - - [38, 40.27] + - [41, 40.27] - - [192, 655360, 1, 48] - - [39, 34.07] + - [42, 34.07] + - - [116, 4096, 1, 320] + - [20, 16.26] + - - [180, 4096, 1, 256] + - [43, 19.1] - - [82, 262144, 1, 32] - - [40, 16.68] + - [44, 16.68] - - [128, 4096, 1, 2048] - - [41, 44.23] + - [45, 44.23] - - [72, 4096, 1, 320] - - [42, 10.63] + - [46, 10.63] - - [28, 4096, 1, 320] - - [43, 5.04] + - [47, 5.04] - - [64, 102400, 1, 64] - - [28, 24.71] + - [31, 24.71] - - [72, 4096, 1, 256] - - [44, 9.21] + - [48, 9.21] - - [256, 4096, 1, 72] - - [45, 9.58] + - [49, 9.58] - - [160, 655360, 1, 10] - - [12, 7.73] + - [13, 7.73] - - [64, 131072, 1, 128] - - [46, 36.73] + - [50, 36.73] + - - [116, 4096, 1, 256] + - [51, 14.16] - - [28, 4096, 1, 256] - - [47, 4.54] + - [52, 4.54] - - [512, 4096, 1, 2246] - - [48, 78.03] + - [53, 78.03] - - [1, 1024, 1, 128] - - [49, 0.02] + - [54, 0.02] + - - [1, 8192, 1, 256] + - [55, 0.3] - - [1, 4096, 1, 512] - - [50, 0.3] + - [56, 0.3] - - [1, 4096, 1, 256] - - [50, 0.18] + - [56, 0.18] - - [1, 4096, 1, 1] - - [49, 0.0] + - [54, 0.0] - - [16, 33, 8192, 128] - - [51, 8.99] + - [57, 8.99] + - - [40, 61, 8192, 128] + - [58, 18.28] - - [64, 32, 4096, 200] - - [52, 22.3] + - [59, 22.3] - - [200, 32, 4096, 64] - - [53, 15.84] + - [60, 15.84] - - [1, 1, 1, 4096] - - [54, 0.0] + - [61, 0.0] - - [512, 1, 1, 4096] - - [55, 0.2] + - [62, 0.2] - - [256, 1, 1, 4096] - - [55, 0.1] + - [62, 0.1] - - [256, 1024, 1, 7968] - - [56, 59.32] + - [63, 59.32] - - [128, 1024, 1, 7456] - - [57, 39.35] + - [64, 39.35] + - - [384, 768, 1, 17711] + - [65, 73.58] - - [256, 4096, 1, 7680] - - [58, 86.69] + - [66, 86.69] - - [128, 1, 1, 8192] - - [59, 0.1] + - [67, 0.1] + - - [256, 1, 1, 8192] + - [68, 0.19] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index d1341df36c7..47eccaf2f45 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -801,12 +801,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xpBVJCKYVdddJ9ZWJn_y6OBqYt3-BIpDKWLuXbExc_BA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32HDLrFKOalrtw2KSVTZ7XOtxf5e65xHrmK1BWhomWgmY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -821,7 +821,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -831,7 +831,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -843,34 +843,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22016 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 22016 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22016 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -883,7 +883,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -891,15 +891,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -920,29 +920,29 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -952,38 +952,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 64 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 64 ThreadTileB: 1 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -995,16 +995,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1017,7 +1017,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -1038,17 +1038,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x64_MI16x1b_n0pL9wvqKZmPxvI_atrAx5msSEDab_k9FB6A5LHIE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x288x32_MI16jlcgKB9N4SyOB1ZF1cNtZuU9XEPy1lSQN0hbLApWEVk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -1068,7 +1068,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1080,34 +1080,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 + LdsBytesNoAmax: 147456 LdsInitCVgprs: false - LdsNumBytes: 120832 + LdsNumBytes: 147456 LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 73728 LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1117,8 +1117,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -1129,14 +1129,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTile: [6, 9] + MIWaveTileA: 6 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 + MacroTile0: 192 + MacroTile1: 288 + MacroTileA: 192 + MacroTileB: 288 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1157,21 +1157,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 3 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 + NumElementsPerThread: 216 + NumGlobalWriteVectorsPerThread: 108 NumLoadsA: 6 - NumLoadsB: 6 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 9 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1180,7 +1180,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1189,17 +1189,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -1211,16 +1211,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 24 + ThreadTile1: 9 + ThreadTileA: 24 + ThreadTileB: 9 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1232,7 +1232,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -1245,10 +1245,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -1263,7 +1263,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -1275,7 +1275,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x288x32_MI16jlcgKB9N4SyOB1ZF1cNtZuU9XEPy1lSQN0hbLApWEVk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x224x32_MI16xExhRy6GXUDd_JG6KWPknC-ImNIhRsFD21cB5RtDRgf4= BufferLoad: true BufferStore: true CUCount: null @@ -1295,17 +1295,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1317,34 +1317,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 46080 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 68096 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1365,15 +1365,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 9] - MIWaveTileA: 6 - MIWaveTileB: 9 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 288 - MacroTileA: 192 - MacroTileB: 288 + MacroTile0: 16 + MacroTile1: 224 + MacroTileA: 16 + MacroTileB: 224 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1394,30 +1394,30 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 + NonTemporalB: 5 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 216 - NumGlobalWriteVectorsPerThread: 108 - NumLoadsA: 6 - NumLoadsB: 9 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 9 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1426,38 +1426,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 9 - ThreadTileA: 24 - ThreadTileB: 9 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1469,15 +1469,15 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -1512,7 +1512,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x128x32_MI16xNV2XQhEHlt21rI1bJFo8vAnWSfz8zM76EEZNUDVuySI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xFTZn1oT96nznuMUL1F717od6NlDNu3Zl5rceKzvu6pw= BufferLoad: true BufferStore: true CUCount: null @@ -1532,7 +1532,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -1542,7 +1542,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1554,34 +1554,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 40448 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 40448 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1594,7 +1594,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -1602,14 +1602,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 48 + MacroTileA: 32 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -1630,30 +1630,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 + NonTemporalA: 1 + NonTemporalB: 3 NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -1663,38 +1663,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 4 - ThreadTileA: 12 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1706,16 +1706,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1737,7 +1737,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -1749,31 +1749,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x224x32_MI16xExhRy6GXUDd_JG6KWPknC-ImNIhRsFD21cB5RtDRgf4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xMGDb4WqTAUNdXONPi4f_hVkBQx-lOyX3fNSJSClAZJM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -1791,34 +1791,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 60224 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 35840 + LdsNumBytes: 60224 + LdsNumElementsAlignedA: 6336 + LdsNumElementsAlignedB: 21120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 68096 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6336 + LdsOffsetB_Blk: 39104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 68096 + LdsOffsetMetadata: 6336 + LdsOffsetMetadata_Blk: 39104 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1826,8 +1826,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -1840,14 +1840,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 2] - MIWaveTile: [1, 7] - MIWaveTileA: 1 - MIWaveTileB: 7 + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 224 - MacroTileA: 16 - MacroTileB: 224 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1861,28 +1861,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 3 NonTemporalB: 5 - NonTemporalC: 0 + NonTemporalC: 1 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 10 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1890,8 +1890,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1900,12 +1900,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -1922,16 +1922,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 7 - ThreadTileA: 4 - ThreadTileB: 7 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1951,8 +1951,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1974,7 +1974,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -1986,27 +1986,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xFTZn1oT96nznuMUL1F717od6NlDNu3Zl5rceKzvu6pw= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -2016,7 +2016,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2028,34 +2028,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2063,10 +2063,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2076,15 +2076,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2098,27 +2098,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2127,7 +2127,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -2137,7 +2137,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2146,29 +2146,29 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2180,23 +2180,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2211,7 +2211,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2223,27 +2223,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xc_P3dMzRTIuQ28njVtEwaNCN95ZzWdJfgzk_qtoBxJM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -2253,7 +2253,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2265,34 +2265,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2300,10 +2300,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2313,15 +2313,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2335,27 +2335,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 + NonTemporalA: 0 + NonTemporalB: 6 NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2364,7 +2364,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -2374,7 +2374,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2383,22 +2383,22 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -2417,23 +2417,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2448,7 +2448,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2460,27 +2460,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xMGDb4WqTAUNdXONPi4f_hVkBQx-lOyX3fNSJSClAZJM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -2490,7 +2490,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2502,34 +2502,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60224 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 60224 - LdsNumElementsAlignedA: 6336 - LdsNumElementsAlignedB: 21120 - LdsNumElementsAlignedMetadata: 0 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6336 - LdsOffsetB_Blk: 39104 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 6336 - LdsOffsetMetadata_Blk: 39104 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2537,8 +2537,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -2550,15 +2550,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2572,36 +2572,36 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 3 - NumLoadsB: 10 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 10 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -2611,32 +2611,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2654,16 +2654,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -2685,7 +2685,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2697,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= BufferLoad: true BufferStore: true CUCount: null @@ -2707,10 +2707,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2739,34 +2739,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2774,12 +2774,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2787,15 +2787,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2809,27 +2809,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2848,13 +2848,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -2863,23 +2863,23 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2892,22 +2892,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2922,7 +2922,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2934,7 +2934,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x64x64_MI16x1ZYzOMFyvQ_WZv4g-rr80VpnruR-8YsAaclS0uKKTkXQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= BufferLoad: true BufferStore: true CUCount: null @@ -2944,7 +2944,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2976,36 +2976,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -3014,10 +3014,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -3025,23 +3025,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3053,21 +3053,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 1 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3085,13 +3085,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -3100,17 +3100,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3129,22 +3129,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3171,20 +3171,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT112x320x32_MI16G4GEHbAdPUcJY9T4f4Vw8xA9HnOy9eBCKZR-rA6UYjQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3195,13 +3195,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3213,34 +3213,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123904 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 123904 - LdsNumElementsAlignedA: 16128 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16128 - LdsOffsetB_Blk: 81664 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16128 - LdsOffsetMetadata_Blk: 81664 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3248,8 +3248,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -3261,15 +3261,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 320 - MacroTileA: 112 - MacroTileB: 320 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3283,28 +3283,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 14 - NumLoadsB: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3313,7 +3313,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3322,32 +3322,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3365,16 +3365,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3408,7 +3408,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= BufferLoad: true BufferStore: true CUCount: null @@ -3418,7 +3418,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -3438,7 +3438,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3450,34 +3450,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 116224 + LdsNumBytes: 135168 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 67584 LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3487,8 +3487,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3498,15 +3498,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3526,22 +3526,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalA: 1 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3559,38 +3559,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3602,29 +3602,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -3633,7 +3633,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -3645,7 +3645,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= BufferLoad: true BufferStore: true CUCount: null @@ -3665,7 +3665,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -3675,7 +3675,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3687,7 +3687,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -3697,24 +3697,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3736,14 +3736,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3763,22 +3763,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 + NonTemporalA: 3 + NonTemporalB: 1 NonTemporalC: 6 NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3786,7 +3786,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -3796,7 +3796,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3805,12 +3805,12 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -3818,10 +3818,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3839,8 +3839,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3848,7 +3848,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3870,7 +3870,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -3882,17 +3882,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -3912,7 +3912,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3924,34 +3924,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3961,10 +3961,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3972,15 +3972,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4000,22 +4000,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4033,38 +4033,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4076,23 +4076,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4119,7 +4119,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x12f43K0NWsDi5cNJjR7R2JTcg_yqtfEHj2PuqgoJwtDk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= BufferLoad: true BufferStore: true CUCount: null @@ -4139,7 +4139,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -4161,7 +4161,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -4172,25 +4172,25 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4198,11 +4198,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4210,23 +4210,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4237,21 +4237,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -4260,7 +4260,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -4270,13 +4270,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -4285,17 +4285,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4314,13 +4314,13 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -4344,7 +4344,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -4356,12 +4356,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4386,7 +4386,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4398,7 +4398,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -4408,26 +4408,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 69632 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 165888 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4435,11 +4435,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4447,23 +4447,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4474,22 +4474,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalC: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4498,7 +4498,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4507,7 +4507,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4516,23 +4516,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4550,16 +4550,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4593,7 +4593,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= BufferLoad: true BufferStore: true CUCount: null @@ -4623,7 +4623,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4635,7 +4635,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -4645,24 +4645,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4684,14 +4684,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4712,21 +4712,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4735,7 +4735,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4744,21 +4744,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -4766,10 +4766,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4787,16 +4787,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4809,7 +4809,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -4830,20 +4830,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4860,7 +4860,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4872,34 +4872,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 125952 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4907,10 +4907,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4921,14 +4921,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4942,27 +4942,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 + NonTemporalA: 3 + NonTemporalB: 0 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -4972,7 +4972,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4981,21 +4981,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -5003,16 +5003,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5024,7 +5024,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -5032,21 +5032,21 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -5067,20 +5067,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5097,7 +5097,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5109,7 +5109,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -5119,24 +5119,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 29568 + LdsNumElementsAlignedB: 21120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 29568 + LdsOffsetB_Blk: 95104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 29568 + LdsOffsetMetadata_Blk: 95104 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5144,8 +5144,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -5158,14 +5158,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 224 + MacroTile1: 160 + MacroTileA: 224 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5179,29 +5179,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 1 NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 7 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 5 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5218,7 +5218,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5227,8 +5227,8 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -5240,10 +5240,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5261,7 +5261,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -5304,20 +5304,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5334,7 +5334,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5346,34 +5346,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5381,12 +5381,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5395,14 +5395,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5416,28 +5416,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5455,7 +5455,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5465,7 +5465,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -5477,10 +5477,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5498,7 +5498,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -5511,10 +5511,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5541,7 +5541,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= BufferLoad: true BufferStore: true CUCount: null @@ -5561,7 +5561,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5571,7 +5571,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5583,7 +5583,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -5593,26 +5593,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5620,11 +5620,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -5632,23 +5632,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5662,18 +5662,18 @@ NonTemporalA: 2 NonTemporalB: 3 NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -5682,7 +5682,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -5692,38 +5692,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5735,14 +5735,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -5778,7 +5778,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xUaGw_W8LDx3RieUoB0YjL0FezgOAUjz28DCpyFICrg4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= BufferLoad: true BufferStore: true CUCount: null @@ -5798,7 +5798,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5820,34 +5820,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -5868,15 +5868,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5896,30 +5896,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -5929,7 +5929,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5943,24 +5943,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 + StreamKXCCMapping: 0 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5973,15 +5973,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6015,7 +6015,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xnPDI4eDL8B8YkIMGYr-3DuqBaS61ghWLUzLSEYUQwcQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= BufferLoad: true BufferStore: true CUCount: null @@ -6025,17 +6025,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -6045,7 +6045,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6057,106 +6057,106 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -6166,32 +6166,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6209,23 +6209,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6252,27 +6252,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI167teIb582yCf_SJBIlrVLw1F7Ht3BtRFeb0kjf6Rcpyk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -6282,7 +6282,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6294,106 +6294,106 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -6403,38 +6403,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6446,23 +6446,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6489,7 +6489,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= BufferLoad: true BufferStore: true CUCount: null @@ -6519,7 +6519,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6531,7 +6531,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -6541,24 +6541,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69632 + LdsBytesNoAmax: 14336 LdsInitCVgprs: false - LdsNumBytes: 69632 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 14336 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69632 - LdsOffsetMetadata_Blk: 165888 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6580,14 +6580,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6607,22 +6607,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalC: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6640,7 +6640,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -6649,8 +6649,8 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -6662,10 +6662,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6683,16 +6683,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6726,7 +6726,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= BufferLoad: true BufferStore: true CUCount: null @@ -6756,7 +6756,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6768,7 +6768,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -6778,24 +6778,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6817,14 +6817,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6844,22 +6844,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6877,21 +6877,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -6899,16 +6899,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6920,16 +6920,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6942,7 +6942,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6963,20 +6963,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7005,99 +7005,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125952 + LdsBytesNoAmax: 28864 LdsInitCVgprs: false - LdsNumBytes: 125952 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 + LdsNumBytes: 28864 + LdsNumElementsAlignedA: 4160 + LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4160 + LdsOffsetB_Blk: 20544 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4160 + LdsOffsetMetadata_Blk: 20544 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7105,7 +7105,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7114,32 +7114,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7158,15 +7158,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7188,7 +7188,7 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -7200,7 +7200,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32WgES1xrDYvB_Vsiz8nGLdWqeSq_GbrDnca8tIfV9V44= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= BufferLoad: true BufferStore: true CUCount: null @@ -7210,17 +7210,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -7242,98 +7242,98 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7341,7 +7341,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -7351,7 +7351,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7360,23 +7360,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 1 - ThreadTileA: 80 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7395,7 +7395,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -7407,10 +7407,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7437,7 +7437,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= BufferLoad: true BufferStore: true CUCount: null @@ -7447,7 +7447,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7479,34 +7479,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 116224 LdsInitCVgprs: false LdsNumBytes: 116224 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 21120 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29568 - LdsOffsetMetadata_Blk: 95104 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7516,8 +7516,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -7528,14 +7528,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 160 - MacroTileA: 224 - MacroTileB: 160 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7556,21 +7556,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 7 - NumLoadsB: 5 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7579,7 +7579,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7588,21 +7588,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7610,16 +7610,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7632,7 +7632,7 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -7640,14 +7640,14 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7664,7 +7664,7 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -7674,7 +7674,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16DbJizEd-UmOez0mMSJd0N_uM7eLvQVu0Z44HBDIDLQQ= BufferLoad: true BufferStore: true CUCount: null @@ -7684,7 +7684,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7697,6 +7697,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7704,7 +7705,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7713,37 +7714,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7753,10 +7754,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7765,14 +7766,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7792,22 +7793,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7825,21 +7826,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7847,10 +7848,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7860,7 +7861,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -7868,23 +7870,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7895,13 +7897,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -7911,29 +7914,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16hJIai0BInLZV7hgR5cDeaVnCphwevySseItKMzY2S58= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7941,7 +7945,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7950,37 +7954,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 - LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 - LdsNumElementsAlignedMetadata: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7988,10 +7992,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -8002,14 +8006,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8023,28 +8027,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8052,7 +8056,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -8062,17 +8066,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8084,20 +8088,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8105,8 +8110,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8118,10 +8123,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -8132,13 +8137,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -8148,12 +8154,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8168,9 +8174,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8178,7 +8185,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8187,39 +8194,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8227,35 +8234,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8266,31 +8273,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8299,32 +8306,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8334,7 +8341,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8342,16 +8350,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8364,18 +8372,19 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -8385,7 +8394,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xA0Nndy-Gm0_-9jnJPOikGUN_FqV0gpTqShRRDXkv1uQ= BufferLoad: true BufferStore: true CUCount: null @@ -8395,19 +8404,20 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8415,7 +8425,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8424,109 +8434,109 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 0 + NonTemporalB: 7 NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -8536,32 +8546,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8571,48 +8581,50 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -8622,7 +8634,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= BufferLoad: true BufferStore: true CUCount: null @@ -8642,9 +8654,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8652,7 +8665,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8661,39 +8674,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8701,35 +8714,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8740,31 +8753,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8773,32 +8786,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 5 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8808,7 +8821,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8816,16 +8830,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8843,13 +8857,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -8859,20 +8874,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16z6K3Sb_rajFl7CvRhVdmX-td587lL_0kZQhA44LAkbg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8882,6 +8897,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8889,7 +8905,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8898,37 +8914,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8936,10 +8952,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -8950,14 +8966,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8971,28 +8987,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalB: 7 + NonTemporalC: 1 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9001,7 +9017,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9010,21 +9026,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9032,10 +9048,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9045,7 +9061,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9053,23 +9070,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9080,13 +9097,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -9096,7 +9114,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= BufferLoad: true BufferStore: true CUCount: null @@ -9119,6 +9137,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9126,7 +9145,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9135,10 +9154,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -9148,24 +9167,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -9187,14 +9206,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9214,22 +9233,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9247,21 +9266,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9269,20 +9288,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9290,16 +9310,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9317,13 +9337,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -9333,20 +9354,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9356,14 +9377,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9372,102 +9394,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28864 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 28864 - LdsNumElementsAlignedA: 4160 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4160 - LdsOffsetB_Blk: 20544 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4160 - LdsOffsetMetadata_Blk: 20544 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9475,7 +9497,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9484,32 +9506,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9519,7 +9541,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9527,16 +9550,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9549,18 +9572,19 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -9570,29 +9594,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xhzz0mUXjdKncv9yWH36CWKfQiJUxDnvM6oldFtHiGPQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9609,51 +9634,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 4 - LoopUnroll: 128 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9665,39 +9690,39 @@ MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -9711,7 +9736,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -9721,31 +9746,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9756,12 +9781,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -9771,18 +9797,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9791,13 +9817,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -9807,29 +9834,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xj7U-zgwpd4zPGOHREedIwdwjVu8pyLWNmqSZsXIQfiQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9846,51 +9874,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9902,45 +9930,45 @@ MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9948,7 +9976,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -9958,7 +9986,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -9973,16 +10001,16 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9993,12 +10021,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10008,18 +10037,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10028,10 +10057,11 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -10044,20 +10074,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x256_MI16x9N4FL5Gr-S5lZKFQp99NVD859UtIa5XyMClMxXi3-_8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10067,7 +10097,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10075,7 +10105,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10087,45 +10117,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10135,15 +10165,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10157,27 +10187,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 6 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 0 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -10187,7 +10217,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10196,32 +10226,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10240,38 +10270,38 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -10284,20 +10314,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xDXqX3nOI1ws82SuzK10Xb7Lyblmqqz7YODwj8fdf6Ec= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10327,7 +10357,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -10337,24 +10367,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 152064 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 152064 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 76032 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 84480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 84480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10362,8 +10392,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -10375,15 +10405,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10397,28 +10427,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalB: 5 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 + NumElementsPerBatchStore: 16 NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10436,32 +10466,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10487,9 +10517,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10502,7 +10532,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -10510,9 +10540,9 @@ numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -10524,7 +10554,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x320x32_MI16xEDT-TS7XsVhCqRrDXPaaM9Dw-865cKdJ2bN2UQP64eM= BufferLoad: true BufferStore: true CUCount: null @@ -10547,7 +10577,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10567,7 +10597,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -10578,23 +10608,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10607,7 +10637,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10615,15 +10645,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10644,21 +10674,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10676,38 +10706,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10717,19 +10747,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10738,7 +10768,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10747,7 +10777,7 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -10764,20 +10794,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10787,15 +10817,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10807,34 +10837,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10842,10 +10872,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10856,14 +10886,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10877,28 +10907,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 + NonTemporalA: 2 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10907,7 +10937,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10916,21 +10946,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -10938,10 +10968,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10957,42 +10987,42 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -11004,7 +11034,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xR68Z4R7jyAOkgwfr2W7Y3XG7smz3NAVO_kD672DuYEg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= BufferLoad: true BufferStore: true CUCount: null @@ -11024,18 +11054,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11047,34 +11077,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 100352 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11095,15 +11125,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11123,22 +11153,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11146,7 +11176,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -11156,7 +11186,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11165,23 +11195,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11197,19 +11227,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11244,7 +11274,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xd-sQFx5RImj-B93gHPT59pdW-PfpHkRagGWfDrPaTnY= BufferLoad: true BufferStore: true CUCount: null @@ -11254,7 +11284,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11275,7 +11305,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11287,34 +11317,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11324,8 +11354,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -11336,14 +11366,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11363,21 +11393,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 + NonTemporalA: 1 + NonTemporalB: 7 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -11396,17 +11426,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11418,10 +11448,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11440,8 +11470,8 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11449,20 +11479,20 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -11484,7 +11514,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-95uwA7thjYZN8Q0bg1wWhpCED9_VDGCoINGlVsybm0= BufferLoad: true BufferStore: true CUCount: null @@ -11515,7 +11545,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11527,7 +11557,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -11537,24 +11567,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 25600 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11576,13 +11606,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] - MIWaveTileA: 5 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 160 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11603,21 +11633,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -11636,7 +11666,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11646,7 +11676,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11658,9 +11688,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 20 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -11677,10 +11707,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11689,7 +11719,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11698,7 +11728,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11712,7 +11742,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -11724,7 +11754,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1R4sVsBWCeh56t4_hhONAiKYW1myOGPmVq0nXhjyJNZo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32NrolHpaoNMGubHMXg2QZmfmFTUHLarzXy0RPUp34mNo= BufferLoad: true BufferStore: true CUCount: null @@ -11767,7 +11797,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -11778,25 +11808,25 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 100352 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 25600 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11804,11 +11834,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -11816,23 +11846,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -11843,22 +11873,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 + NonTemporalA: 1 + NonTemporalB: 1 NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11876,13 +11906,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -11890,24 +11920,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11917,19 +11947,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11942,7 +11972,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -12432,6 +12462,246 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16ZwWemFSm3p4igCb8DLsnt6R0foWhVdW4rDKTnbvNCrM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true @@ -12595,7 +12865,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 + SolutionIndex: 53 SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] @@ -12684,7 +12954,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI1632V6Pxwe_725MnjTV9-vmbwgaM0uPWz-Wc0cWfNZEtw= BufferLoad: true BufferStore: true CUCount: null @@ -12715,7 +12985,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12727,34 +12997,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12767,7 +13037,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12775,15 +13045,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12803,23 +13073,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12835,8 +13105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12846,28 +13116,28 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12880,16 +13150,16 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -12924,7 +13194,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16q_1WHADry2cTxAoNmb44qR0nw6Q5GfHEK9JlilAlfio= BufferLoad: true BufferStore: true CUCount: null @@ -12944,7 +13214,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -12955,7 +13225,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12967,34 +13237,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13015,15 +13285,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13043,31 +13313,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13075,39 +13345,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13117,19 +13387,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13138,7 +13408,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -13164,7 +13434,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= BufferLoad: true BufferStore: true CUCount: null @@ -13184,7 +13454,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -13195,7 +13465,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13207,34 +13477,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 89088 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 89088 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 71680 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 148480 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 89088 - LdsOffsetMetadata_Blk: 148480 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13247,7 +13517,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13255,15 +13525,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 7] - MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 448 - MacroTileA: 128 - MacroTileB: 448 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13283,30 +13553,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalA: 5 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 14 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -13315,39 +13585,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 7 - ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13357,17 +13627,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -13392,7 +13662,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -13404,12 +13674,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -13424,9 +13694,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -13443,39 +13714,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 8 + LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 109056 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 109056 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13483,35 +13754,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [1, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13522,30 +13793,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -13554,48 +13825,49 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -13605,9 +13877,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13616,7 +13888,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -13625,11 +13897,12 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -13641,20 +13914,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x48x32_MI16x77c5vCqhF37wngAzGqfnFLKmA0iPzLeWaJiRYnuXPkU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13664,14 +13937,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13680,102 +13954,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 150528 + LdsBytesNoAmax: 33280 LdsInitCVgprs: false - LdsNumBytes: 150528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 66560 + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 75264 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 83968 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 160 + MacroTile1: 48 + MacroTileA: 160 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 10 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 3 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13783,7 +14057,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13791,33 +14065,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13827,24 +14101,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13853,19 +14128,20 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true @@ -13878,7 +14154,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= BufferLoad: true BufferStore: true CUCount: null @@ -13898,9 +14174,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -13908,7 +14185,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13917,10 +14194,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -13930,26 +14207,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 89088 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 89088 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 89088 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13957,11 +14234,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -13969,23 +14246,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13996,22 +14273,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14019,8 +14296,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14028,60 +14305,61 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 7 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTileB: 7 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14090,20 +14368,21 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -14115,7 +14394,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32WmWOeC1HZXF52s2qw5enQVIcphzfzkpjTKp5gB44mAw= BufferLoad: true BufferStore: true CUCount: null @@ -14138,6 +14417,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14145,7 +14425,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14154,10 +14434,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -14167,24 +14447,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129536 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 129536 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 83968 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -14197,7 +14477,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14205,15 +14485,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 384 - MacroTileA: 64 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14233,22 +14513,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 + NonTemporalA: 4 + NonTemporalB: 1 NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 2 - NumLoadsB: 12 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14265,22 +14545,261 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 109056 + LdsInitCVgprs: false + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 128 SubGroupA: 2 @@ -14289,15 +14808,15 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14310,14 +14829,14 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 + WorkGroupMapping: 6 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -14352,7 +14871,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= BufferLoad: true BufferStore: true CUCount: null @@ -14362,7 +14881,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -14376,7 +14895,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -14394,72 +14913,72 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 150528 LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 150528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 75264 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 83968 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14470,22 +14989,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14502,33 +15021,3133 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 76288 + LdsInitCVgprs: false + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 129536 + LdsInitCVgprs: false + LdsNumBytes: 129536 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 384 + MacroTileA: 64 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 2 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115712 + LdsInitCVgprs: false + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 5 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 17408 + LdsInitCVgprs: false + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 29504 + LdsInitCVgprs: false + LdsNumBytes: 29504 + LdsNumElementsAlignedA: 10560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 10560 + LdsOffsetB_Blk: 26944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 10560 + LdsOffsetMetadata_Blk: 26944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 16 + MacroTileA: 80 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 7 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 156672 + LdsInitCVgprs: false + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 80 + MacroTileA: 64 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 86016 + LdsInitCVgprs: false + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 8 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: true + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 135168 + LdsInitCVgprs: false + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14537,34 +18156,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseCustomMainLoopSchedule: 1 + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14573,13 +18193,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -14589,7 +18210,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= BufferLoad: true BufferStore: true CUCount: null @@ -14599,7 +18220,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -14612,6 +18233,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14628,42 +18250,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 18432 LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -14679,15 +18301,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14707,23 +18329,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14739,8 +18363,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -14755,17 +18379,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14775,31 +18399,35 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -14810,13 +18438,14 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 @@ -14826,19 +18455,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1oxplz8DgKAzzHI9atKMOCqO_fHvi4aY6eI_59fmy5x0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -14849,9 +18478,10 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -14865,37 +18495,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29504 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 29504 - LdsNumElementsAlignedA: 10560 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10560 - LdsOffsetB_Blk: 26944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10560 - LdsOffsetMetadata_Blk: 26944 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14903,10 +18533,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -14916,15 +18546,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [5, 1] - MIWaveTileA: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 16 - MacroTileA: 80 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14944,7 +18574,7 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 @@ -14952,15 +18582,17 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14968,7 +18600,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14976,32 +18608,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 20 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -15012,12 +18644,16 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -15027,29 +18663,30 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false @@ -15063,7 +18700,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x64_MI16HbQjL6Xiy4_bwYKGnfEzEBO73K7v6GCjIcyS997kha4= BufferLoad: true BufferStore: true CUCount: null @@ -15073,7 +18710,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15086,6 +18723,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15093,7 +18731,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15102,37 +18740,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -15142,8 +18780,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -15154,14 +18792,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15181,23 +18819,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15213,22 +18853,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -15236,10 +18876,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15248,34 +18888,38 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 2 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15284,11 +18928,12 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -15300,17 +18945,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15320,7 +18965,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -15331,7 +18976,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15343,34 +18988,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -15380,26 +19025,26 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15420,29 +19065,31 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 10 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -15451,33 +19098,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15486,35 +19133,38 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15528,7 +19178,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -15540,20 +19190,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI16MWaFtORgD4eC8ES5PuaeS70bQdnSFwME3aBH6fKhp6A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15571,7 +19221,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15583,34 +19233,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 152064 LdsInitCVgprs: false - LdsNumBytes: 86016 + LdsNumBytes: 152064 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedB: 42240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 76032 LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB_Blk: 109824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 109824 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -15618,12 +19268,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15632,14 +19282,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15653,29 +19303,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15691,22 +19343,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSwapAddr: true + StoreSyncOpt: 4 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -15714,29 +19366,32 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -15749,10 +19404,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -15766,8 +19421,8 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -15780,17 +19435,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15800,7 +19455,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -15811,7 +19466,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15823,34 +19478,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 86016 + LdsNumBytes: 51200 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -15860,10 +19515,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15872,14 +19527,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15899,30 +19554,32 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -15931,13 +19588,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 66 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -15946,7 +19603,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -15954,10 +19611,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15966,35 +19623,38 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 + WorkGroupMapping: 32 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16012,7 +19672,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16020,18 +19680,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -16042,16 +19702,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16060,11 +19720,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 128 LSCB: 128 LSPA: 8 @@ -16073,37 +19733,37 @@ LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16111,15 +19771,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16140,21 +19800,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16171,70 +19831,71 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 67 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16252,7 +19913,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16260,7 +19921,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= BufferLoad: true BufferStore: true CUCount: null @@ -16270,8 +19930,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -16282,16 +19942,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16300,50 +19960,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16351,15 +20011,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16380,21 +20040,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16403,7 +20063,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16411,33 +20071,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 68 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16446,35 +20107,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16492,7 +20153,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16500,18 +20161,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -16520,18 +20181,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16540,75 +20201,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16619,22 +20280,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 8 - NumLoadsB: 24 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16642,7 +20303,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -16651,33 +20312,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 69 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16686,43 +20348,43 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -16732,7 +20394,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16740,7 +20402,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= BufferLoad: true BufferStore: true CUCount: null @@ -16751,7 +20413,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -16760,18 +20422,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16780,11 +20442,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 128 LSCB: 128 LSPA: 8 @@ -16793,62 +20455,62 @@ LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 192 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 192 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16860,21 +20522,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 - NumLoadsB: 24 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16882,8 +20544,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16891,33 +20553,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 70 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16926,35 +20589,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16968,11 +20631,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16980,7 +20643,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= BufferLoad: true BufferStore: true CUCount: null @@ -16990,8 +20652,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -17002,68 +20664,68 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17071,15 +20733,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17101,23 +20763,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17125,7 +20785,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17133,39 +20793,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 71 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17173,32 +20834,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -17206,18 +20864,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17225,18 +20883,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1Y2_PBeJGjEXXPI1_8Q1nplFuMWjj1kVwTjC8QiIVYG0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -17247,15 +20905,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -17265,37 +20923,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17308,7 +20966,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17316,15 +20974,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17351,18 +21009,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17378,33 +21034,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 72 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 4 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17418,12 +21075,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -17433,17 +21087,17 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -17458,11 +21112,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17470,7 +21124,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI161e0FqGLAMztP_dD6xxGdXRut9vwmQsZsotL-hvEenUg= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= BufferLoad: true BufferStore: true CUCount: null @@ -17480,10 +21134,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17492,16 +21146,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17510,37 +21164,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 152064 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 109824 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 109824 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17548,12 +21202,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17561,15 +21215,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17583,31 +21237,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 10 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 - NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 10 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -17623,39 +21275,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 73 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -17663,51 +21316,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17715,7 +21365,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null @@ -17726,7 +21376,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -17735,18 +21385,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17755,37 +21405,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17798,7 +21448,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17806,15 +21456,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17836,30 +21486,28 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -17868,37 +21516,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 74 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -17908,32 +21557,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -17948,11 +21594,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17960,7 +21606,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xWU2YXZSq42Zyg4I1M1bqQl8lWMhuX8bgcxNNF-WMw3U= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= BufferLoad: true BufferStore: true CUCount: null @@ -17971,9 +21617,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17982,16 +21628,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18000,11 +21646,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 LSCA: 256 LSCB: 256 LSPA: 4 @@ -18013,24 +21659,24 @@ LVCB: 64 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18038,12 +21684,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18052,13 +21698,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -18073,31 +21719,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18113,19 +21757,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 75 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC16_WGMXCCGn1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -18135,15 +21779,16 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18153,15 +21798,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -18169,16 +21811,16 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 256 _DepthUA: 256 _DepthUB: 256 _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -18186,18 +21828,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18205,20 +21847,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Rdlq7Rc2vP_yhCpcNjpQdCrFtftCCMe2J1b1mDh_IUI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18227,16 +21868,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18245,51 +21886,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -18297,42 +21938,42 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 @@ -18341,8 +21982,6 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18350,7 +21989,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18358,33 +21997,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 76 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18398,51 +22038,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true - AdaptiveGemm: 0 + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18450,18 +22087,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -18470,17 +22106,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -18490,37 +22126,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18530,10 +22166,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18542,14 +22178,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18571,31 +22207,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18603,37 +22237,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 77 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -18643,32 +22278,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -18676,14 +22308,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -18695,7 +22327,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT8eZn8BuF1Nziw24iNKnDP44-Wc-OfwT54KqMWpiGeWHs= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= BufferLoad: true BufferStore: true CUCount: null @@ -18720,7 +22352,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -18738,34 +22370,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 82432 + LdsBytesNoAmax: 13312 LdsInitCVgprs: false - LdsNumBytes: 82432 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 69632 + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 143872 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 82432 - LdsOffsetMetadata_Blk: 143872 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18786,15 +22418,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 512 - MacroTileA: 80 - MacroTileB: 512 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18821,16 +22453,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18846,8 +22478,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 78 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -18863,17 +22495,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18893,13 +22525,13 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -18917,8 +22549,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -18936,17 +22568,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -18967,7 +22599,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18979,45 +22611,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 26112 + LdsNumBytes: 34816 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -19027,15 +22659,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19063,14 +22695,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19079,7 +22711,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19087,38 +22719,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 79 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19133,23 +22765,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -19177,6 +22809,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= BufferLoad: true BufferStore: true CUCount: null @@ -19207,7 +22840,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19219,8 +22852,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -19229,24 +22862,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34304 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 34304 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34304 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -19267,15 +22900,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19303,14 +22936,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19327,8 +22960,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 80 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -19338,22 +22971,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -19373,15 +23006,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -19398,8 +23031,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -19417,7 +23050,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= BufferLoad: true BufferStore: true CUCount: null @@ -19427,7 +23060,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -19460,39 +23093,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 67072 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -19508,15 +23141,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19544,14 +23177,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19568,13 +23201,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 81 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -19584,18 +23217,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19615,22 +23248,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -19639,8 +23272,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -19658,17 +23291,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -19689,7 +23322,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19701,45 +23334,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -19749,10 +23382,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 32 @@ -19786,13 +23419,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19801,7 +23434,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19809,38 +23442,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 82 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19855,23 +23488,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -19880,8 +23513,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -19899,16 +23532,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68SIh_bnF1917_7343uVm45oe_0GQqSHLfLK0W_wi_u8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -19929,7 +23563,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19941,45 +23575,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -19989,10 +23623,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -20026,13 +23660,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20041,7 +23675,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20049,34 +23683,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 83 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20095,23 +23729,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -20127,7 +23761,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -20139,17 +23773,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -20170,7 +23804,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20182,45 +23816,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -20230,15 +23864,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20266,15 +23900,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20282,7 +23916,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20290,34 +23924,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 84 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20336,23 +23970,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -20361,8 +23995,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -20380,7 +24014,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6R02QOCgIPWslloJiox9BjhuIrZ3CRli_uOaWUztN7xk= BufferLoad: true BufferStore: true CUCount: null @@ -20411,7 +24045,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20423,45 +24057,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 LSCA: 128 LSCB: 128 - LSPA: 4 - LSPB: 4 + LSPA: 8 + LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -20471,15 +24105,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20507,15 +24141,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20531,8 +24165,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 85 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -20542,23 +24176,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20577,14 +24211,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -20621,6 +24255,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AxfD8GQK1funv0jxGwFu2nRz6aluGc9YL9jcdz-7fEk= BufferLoad: true BufferStore: true CUCount: null @@ -20663,8 +24298,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 128 LSCB: 128 LSPA: 8 @@ -20676,21 +24311,21 @@ LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -20712,13 +24347,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -20747,13 +24382,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -20771,8 +24406,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 86 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -20795,9 +24430,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -20825,7 +24460,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -20842,14 +24477,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -20861,7 +24496,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null @@ -20871,7 +24505,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -20892,7 +24526,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20904,39 +24538,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -20953,14 +24587,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20988,15 +24622,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21012,18 +24646,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -21036,14 +24670,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -21058,23 +24692,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -21083,8 +24717,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21102,17 +24736,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -21133,7 +24767,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21145,45 +24779,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 72704 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 72704 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 72704 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -21193,15 +24827,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21229,14 +24863,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21245,7 +24879,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21253,34 +24887,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21299,23 +24933,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -21343,6 +24977,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= BufferLoad: true BufferStore: true CUCount: null @@ -21373,7 +25008,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21385,8 +25020,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -21395,24 +25030,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21434,14 +25069,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21469,14 +25104,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21493,8 +25128,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -21504,7 +25139,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -21517,14 +25152,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -21539,15 +25174,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -21564,8 +25199,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21583,7 +25218,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9yCMUvmmmKthOJiKqIB_mYxsvQAAqv6o_A39DiLPiA4s= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= BufferLoad: true BufferStore: true CUCount: null @@ -21614,7 +25249,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21626,8 +25261,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -21636,24 +25271,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 15360 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 15360 - LdsOffsetB_Blk: 48128 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 48128 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21675,13 +25310,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -21709,14 +25344,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -21734,8 +25369,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -21745,7 +25380,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -21758,9 +25393,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 12 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -21780,7 +25415,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -21805,8 +25440,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21824,16 +25459,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eRXgyMefjCuWXVIY2LbceXYoQhTimXSfxsYDWpNFOp0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -21866,34 +25502,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21903,8 +25539,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -21914,14 +25550,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -21950,14 +25586,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21966,7 +25602,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21974,13 +25610,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -21990,22 +25626,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -22021,22 +25657,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -22052,7 +25688,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -22064,7 +25700,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= BufferLoad: true BufferStore: true CUCount: null @@ -22074,7 +25709,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -22095,7 +25730,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22107,39 +25742,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13312 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -22155,15 +25790,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22191,15 +25826,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22215,34 +25850,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22261,23 +25896,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -22286,8 +25921,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22305,17 +25940,16 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -22348,34 +25982,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -22385,8 +26019,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -22434,12 +26068,12 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22448,7 +26082,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22456,13 +26090,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -22510,15 +26144,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -22527,8 +26161,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22546,12 +26180,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -22577,7 +26211,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22589,7 +26223,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -22599,35 +26233,35 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -22637,15 +26271,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22672,15 +26306,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22689,7 +26323,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22697,8 +26331,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -22708,23 +26342,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22743,14 +26377,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -22787,7 +26421,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= BufferLoad: true BufferStore: true CUCount: null @@ -22797,7 +26430,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -22818,7 +26451,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22830,39 +26463,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 + LdsBytesNoAmax: 90624 LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -22878,15 +26511,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22913,15 +26546,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22938,33 +26571,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 20 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 20 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -22984,23 +26617,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -23028,17 +26661,16 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -23059,7 +26691,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23071,34 +26703,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 87040 LdsInitCVgprs: false - LdsNumBytes: 26624 + LdsNumBytes: 87040 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedB: 69632 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 131072 LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 87040 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -23108,8 +26740,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -23119,15 +26751,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23155,14 +26787,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23171,7 +26803,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23179,34 +26811,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23225,23 +26857,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -23250,14 +26882,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -23269,7 +26901,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= BufferLoad: true BufferStore: true CUCount: null @@ -23279,7 +26910,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -23292,7 +26923,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: 1 - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -23300,7 +26931,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23312,39 +26943,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 128000 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -23360,15 +26991,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23395,14 +27026,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -23420,38 +27051,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -23466,23 +27097,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -23493,7 +27124,7 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -23510,16 +27141,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1U7vOvdYFB8EejoI1DeZ2XXlfHVNEE5d0H12vbNNpATs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -23552,45 +27184,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99840 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 99840 - LdsNumElementsAlignedA: 66560 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 66560 - LdsOffsetB_Blk: 197632 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 99840 - LdsOffsetMetadata_Blk: 197632 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -23600,15 +27232,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 160 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23636,14 +27268,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23652,7 +27284,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23660,13 +27292,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -23677,17 +27309,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 5 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23707,22 +27339,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -23750,6 +27382,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= BufferLoad: true BufferStore: true CUCount: null @@ -23759,7 +27392,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -23780,7 +27413,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23792,39 +27425,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 66560 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 66560 - LdsOffsetB_Blk: 197632 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 99840 - LdsOffsetMetadata_Blk: 197632 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -23840,15 +27473,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23876,14 +27509,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23900,38 +27533,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -23946,23 +27579,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -23990,7 +27623,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT46N5m9gwCFO-Nzc9yENVU3SI1QrpCiOXEjeEPFSn86Hk= BufferLoad: true BufferStore: true CUCount: null @@ -24015,13 +27648,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24033,34 +27666,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 72704 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 72704 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 148480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 72704 - LdsOffsetMetadata_Blk: 148480 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -24082,14 +27715,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 384 - MacroTileA: 128 - MacroTileB: 384 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24116,15 +27749,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 12 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24141,8 +27774,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24152,7 +27785,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -24165,14 +27798,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -24187,7 +27820,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -24231,7 +27864,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vaiUWgr6lZ68qXC_TGMU65523uOYKq3Ec6eraxi_h38= BufferLoad: true BufferStore: true CUCount: null @@ -24274,7 +27907,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -24287,21 +27920,21 @@ LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18432 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 18432 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -24322,15 +27955,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [4, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24360,12 +27993,12 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24382,8 +28015,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24398,10 +28031,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -24413,7 +28046,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -24435,7 +28068,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -24472,7 +28105,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= BufferLoad: true BufferStore: true CUCount: null @@ -24498,7 +28131,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true @@ -24515,34 +28148,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18432 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 18432 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -24563,15 +28196,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24599,14 +28232,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24623,8 +28256,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -24639,18 +28272,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24670,13 +28303,13 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -24713,6 +28346,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= BufferLoad: true BufferStore: true CUCount: null @@ -24722,7 +28356,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -24755,39 +28389,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -24803,15 +28437,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [4, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24842,11 +28476,11 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24863,13 +28497,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -24879,18 +28513,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24910,22 +28544,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -24934,8 +28568,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24953,7 +28587,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHDQbHdXWrhDPz3uAOriInqMw0_ypUmAB2yHcbrln9g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1rXTuKdoyxclHidecoClA0AfcqpENiqcNipr2eRQNhwg= BufferLoad: true BufferStore: true CUCount: null @@ -24996,8 +28630,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -25007,13 +28641,13 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 67584 + LdsNumBytes: 79872 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 @@ -25022,7 +28656,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 + LdsOffsetMetadata: 79872 LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 @@ -25045,14 +28679,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveTile: [4, 5] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 160 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25080,14 +28714,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25104,8 +28738,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25129,13 +28763,13 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 5 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25151,7 +28785,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -25175,8 +28809,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25194,16 +28828,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -25224,7 +28859,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25236,34 +28871,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 44544 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -25273,8 +28908,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -25284,15 +28919,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25319,15 +28954,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25336,7 +28971,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -25344,34 +28979,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25390,23 +29025,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -25415,8 +29050,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25434,17 +29069,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -25465,7 +29100,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25477,34 +29112,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 79872 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -25514,8 +29149,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -25526,14 +29161,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25560,15 +29195,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 + NumElementsPerBatchStore: 8 NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25577,7 +29212,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -25585,18 +29220,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -25609,10 +29244,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25631,8 +29266,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -25644,10 +29279,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -25656,8 +29291,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25675,6 +29310,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= BufferLoad: true BufferStore: true CUCount: null @@ -25705,7 +29341,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25717,8 +29353,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -25727,24 +29363,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 90624 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 90624 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 154112 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 90624 - LdsOffsetMetadata_Blk: 154112 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -25765,15 +29401,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 256 - MacroTileA: 80 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25800,15 +29436,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25825,8 +29461,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -25836,23 +29472,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25871,15 +29507,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -25896,8 +29532,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -25915,7 +29551,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1cxVKEoGpectXJ4hizehb-leeaygHA2aT8hzudE-aBUA= BufferLoad: true BufferStore: true CUCount: null @@ -25925,7 +29560,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -25958,39 +29593,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 81920 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -26006,15 +29641,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [2, 1] MIWaveTile: [4, 5] MIWaveTileA: 4 MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 160 + MacroTile1: 80 MacroTileA: 128 - MacroTileB: 160 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26042,13 +29677,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 @@ -26066,13 +29701,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -26083,9 +29718,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -26097,7 +29732,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26119,16 +29754,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -26156,7 +29791,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-XbEyQ5BisbWWMjTpMs98hyJ-dhnlHSIfClgvFcUEQk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KkEUdHGIe6dKJ0zzMlcYFBbwtI1FwPuq4-b2hkE_RVc= BufferLoad: true BufferStore: true CUCount: null @@ -26187,7 +29822,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26199,7 +29834,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -26209,24 +29844,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26248,14 +29883,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26283,14 +29918,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26307,8 +29942,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -26318,7 +29953,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -26331,14 +29966,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26353,8 +29988,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -26397,7 +30032,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65_iMlfIkk8B93jR0j9fItJcWD8qLSyBShDieS7L1wt0= BufferLoad: true BufferStore: true CUCount: null @@ -26407,7 +30042,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -26423,7 +30058,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true @@ -26440,39 +30075,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 31232 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -26488,14 +30123,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [2, 1] MIWaveTile: [2, 3] MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 48 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -26524,8 +30159,8 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 NumLoadsA: 4 NumLoadsB: 3 NumLoadsCoalescedA: 1 @@ -26548,13 +30183,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -26564,9 +30199,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -26601,16 +30236,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -26619,8 +30254,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26638,17 +30273,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9gGJDLTTlViLZGJavc0sPMnxgvCGjSJIqu4gt9wnKCkU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -26669,7 +30304,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26681,34 +30316,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26718,8 +30353,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -26730,14 +30365,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26764,15 +30399,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26781,7 +30416,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26789,18 +30424,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -26813,14 +30448,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26835,8 +30470,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -26848,10 +30483,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -26879,7 +30514,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45ulS7WcdI7UNW-ipBhI5_9g0NWEweK1v4iw5AdMdwyw= BufferLoad: true BufferStore: true CUCount: null @@ -26904,13 +30539,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26922,34 +30557,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44544 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 44544 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 30720 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 79360 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44544 - LdsOffsetMetadata_Blk: 79360 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26971,14 +30606,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 192 - MacroTileA: 96 - MacroTileB: 192 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27006,14 +30641,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 6 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 NumLoadsA: 3 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27030,8 +30665,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -27041,7 +30676,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -27054,14 +30689,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -27076,8 +30711,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -27120,7 +30755,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null @@ -27145,13 +30780,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27163,34 +30798,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27211,15 +30846,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27246,15 +30881,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27271,8 +30906,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -27282,23 +30917,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27317,14 +30952,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -27361,17 +30996,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -27392,7 +31027,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27404,34 +31039,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27441,8 +31076,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -27452,15 +31087,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27488,14 +31123,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 40 + NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 + NumLoadsA: 2 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27504,7 +31139,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27512,38 +31147,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 5 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -27558,23 +31193,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -27602,16 +31237,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zKzQos5jBY-oZsuABvfgC-YVr0XQkeZ9FoaVzwSEWkA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -27626,7 +31262,7 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel @@ -27644,34 +31280,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 79872 + LdsBytesNoAmax: 48640 LdsInitCVgprs: false - LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 48640 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata: 48640 + LdsOffsetMetadata_Blk: 83456 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27681,8 +31317,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -27692,15 +31328,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 112 + MacroTile1: 192 + MacroTileA: 112 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27727,15 +31363,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27744,7 +31380,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27752,13 +31388,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -27768,18 +31404,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27799,22 +31435,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -27823,8 +31459,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27842,7 +31478,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT31zhAEgAY7p8a5We0BzsiowfNKry6VBUHEQOZboSmQsk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= BufferLoad: true BufferStore: true CUCount: null @@ -27852,7 +31488,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -27885,39 +31521,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -27933,15 +31569,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 48 + MacroTile1: 128 MacroTileA: 32 - MacroTileB: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27969,14 +31605,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27993,13 +31629,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -28010,17 +31646,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28040,22 +31676,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -28083,6 +31719,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= BufferLoad: true BufferStore: true CUCount: null @@ -28092,7 +31729,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28125,39 +31762,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 81920 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 81920 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 56832 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -28173,15 +31810,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 5] + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 80 - MacroTileA: 128 - MacroTileB: 80 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28209,14 +31846,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 10 - NumLoadsA: 8 - NumLoadsB: 5 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28233,13 +31870,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -28249,22 +31886,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -28280,22 +31917,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -28323,6 +31960,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= BufferLoad: true BufferStore: true CUCount: null @@ -28348,12 +31986,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28365,34 +32003,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28413,15 +32051,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28449,14 +32087,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28473,8 +32111,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -28484,27 +32122,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -28519,15 +32157,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -28563,17 +32201,16 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28588,13 +32225,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28606,34 +32243,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 40448 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 40448 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28643,8 +32280,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -28654,14 +32291,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 48 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -28689,15 +32326,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28706,7 +32343,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28714,38 +32351,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -28760,23 +32397,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -28804,7 +32441,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= BufferLoad: true BufferStore: true CUCount: null @@ -28835,7 +32471,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28847,8 +32483,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -28857,24 +32493,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 94720 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28896,14 +32532,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 320 - MacroTileA: 64 - MacroTileB: 320 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28930,15 +32566,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28955,8 +32591,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -28966,7 +32602,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -28979,14 +32615,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -29001,7 +32637,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -29009,7 +32645,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -29026,8 +32662,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -29045,17 +32681,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hKaBSm-LDSAQofLbqhwHQZEeIsXzaeyVIdwWTVQFEe8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29088,34 +32724,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29125,8 +32761,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -29136,15 +32772,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29174,11 +32810,11 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -29188,7 +32824,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29196,13 +32832,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -29212,10 +32848,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -29249,16 +32885,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -29286,7 +32922,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6sFfDRoimNH8imnTTvcLS2cE_cU4dhe0vq9JydCcQHaY= BufferLoad: true BufferStore: true CUCount: null @@ -29329,7 +32965,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -29340,23 +32976,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29377,15 +33013,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29413,14 +33049,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29437,8 +33073,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -29453,22 +33089,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 5 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -29484,13 +33120,13 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -29527,7 +33163,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= BufferLoad: true BufferStore: true CUCount: null @@ -29558,7 +33193,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29570,8 +33205,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -29580,24 +33215,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29618,15 +33253,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29654,14 +33289,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29678,8 +33313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -29689,27 +33324,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -29724,15 +33359,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -29749,8 +33384,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -29768,16 +33403,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29798,7 +33434,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29810,34 +33446,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29847,8 +33483,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -29858,15 +33494,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29894,14 +33530,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29910,7 +33546,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29918,34 +33554,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29964,23 +33600,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -29989,8 +33625,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -30008,7 +33644,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= BufferLoad: true BufferStore: true CUCount: null @@ -30051,8 +33686,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -30159,8 +33794,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -30213,7 +33848,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -30230,8 +33865,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -30249,17 +33884,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30280,7 +33915,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30292,34 +33927,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -30329,8 +33964,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -30341,14 +33976,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30376,14 +34011,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30392,7 +34027,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30400,18 +34035,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -30424,14 +34059,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30446,8 +34081,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30459,10 +34094,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -30490,6 +34125,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= BufferLoad: true BufferStore: true CUCount: null @@ -30499,7 +34135,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30532,39 +34168,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 94720 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 94720 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 71680 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 154112 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 94720 - LdsOffsetMetadata_Blk: 154112 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -30580,15 +34216,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 7] - MIWaveTileA: 10 - MIWaveTileB: 7 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 448 - MacroTileA: 160 - MacroTileB: 448 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30615,15 +34251,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 5 - NumLoadsB: 14 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30640,13 +34276,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -30657,21 +34293,21 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 7 - ThreadTileA: 40 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30687,22 +34323,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -30711,14 +34347,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -30739,7 +34375,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30752,7 +34388,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: 1 - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -30760,7 +34396,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30772,39 +34408,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 128000 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 66560 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 66560 - LdsOffsetB_Blk: 197632 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 99840 - LdsOffsetMetadata_Blk: 197632 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -30820,15 +34456,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30855,14 +34491,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 6 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -30880,34 +34516,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30926,23 +34562,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -30951,9 +34587,9 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 1 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -30970,7 +34606,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6Q86Al2n33vUP8nQtrYZlZYGsrzmmIC_0MPgq_9eYXoc= BufferLoad: true BufferStore: true CUCount: null @@ -30980,7 +34616,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31001,7 +34637,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31013,39 +34649,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -31061,15 +34697,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31097,14 +34733,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31121,38 +34757,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31167,23 +34803,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -31253,7 +34889,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -31264,23 +34900,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 100864 LdsInitCVgprs: false - LdsNumBytes: 43520 + LdsNumBytes: 100864 LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedB: 92160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 131072 LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -31302,14 +34938,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] + MIWaveTile: [4, 9] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 9 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 576 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 576 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31337,14 +34973,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsB: 18 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 18 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31361,8 +34997,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 130 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -31386,9 +35022,9 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 9 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31408,7 +35044,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -31451,7 +35087,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= BufferLoad: true BufferStore: true CUCount: null @@ -31461,7 +35097,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31482,7 +35118,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31494,39 +35130,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -31542,15 +35178,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31578,14 +35214,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31602,38 +35238,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31648,23 +35284,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -31692,7 +35328,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= BufferLoad: true BufferStore: true CUCount: null @@ -31735,7 +35370,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 128 LSCB: 128 @@ -31746,13 +35381,13 @@ LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 33792 + LdsNumBytes: 43008 LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -31761,7 +35396,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 + LdsOffsetMetadata: 43008 LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 @@ -31784,14 +35419,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 48 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31819,14 +35454,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31843,8 +35478,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -31868,13 +35503,13 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31890,14 +35525,14 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -31933,16 +35568,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31975,34 +35611,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100864 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 100864 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 92160 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 100864 - LdsOffsetMetadata_Blk: 139776 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -32012,8 +35648,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -32023,15 +35659,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 9] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 576 - MacroTileA: 64 - MacroTileB: 576 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32059,14 +35695,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 2 - NumLoadsB: 18 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 18 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32075,7 +35711,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32083,13 +35719,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 133 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -32099,22 +35735,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 9 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 9 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -32130,22 +35766,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -32154,8 +35790,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -32173,6 +35809,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= BufferLoad: true BufferStore: true CUCount: null @@ -32182,7 +35819,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32203,7 +35840,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32215,39 +35852,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -32263,15 +35900,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32299,14 +35936,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32323,34 +35960,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 134 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32369,23 +36006,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -32394,8 +36031,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -32413,7 +36050,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eM9UlY0usrKvssU3Ar5zZiiJpm6PCupFKVpS-ZFqsX0= BufferLoad: true BufferStore: true CUCount: null @@ -32423,7 +36060,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32444,7 +36081,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32456,39 +36093,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -32504,15 +36141,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32540,14 +36177,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32564,34 +36201,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 135 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32610,23 +36247,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -32642,7 +36279,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -32654,16 +36291,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ByCgfye7rWIPQ1YJohmIQ0lNCn8NrEKn7vXmxjyaZFM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32676,7 +36314,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: 1 - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -32684,7 +36322,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32696,45 +36334,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -32744,15 +36382,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 4] - MIWaveTileA: 10 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32779,15 +36417,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -32796,7 +36434,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32804,34 +36442,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 136 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 4 - ThreadTileA: 40 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32850,23 +36488,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -32875,9 +36513,9 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -32894,16 +36532,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32936,34 +36575,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -32973,8 +36612,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -32985,14 +36624,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] + MIWaveTile: [4, 1] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 320 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 320 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -33020,14 +36659,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -33036,7 +36675,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -33044,13 +36683,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 137 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -33069,9 +36708,9 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33098,15 +36737,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -33134,7 +36773,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZglzLcUOj7D9-4HA3xQ6qm78zoBYVUYUrp0ST-4N2Tc= BufferLoad: true BufferStore: true CUCount: null @@ -33165,7 +36804,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33177,7 +36816,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -33187,24 +36826,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 78336 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 78336 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 78336 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -33226,14 +36865,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 160 + MacroTile1: 384 + MacroTileA: 160 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -33260,15 +36899,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -33285,8 +36924,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 138 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -33296,7 +36935,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -33309,10 +36948,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33331,8 +36970,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -33375,17 +37014,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HKFsR1xpqIqB8iGFlETrmdcYqlXRxk0kl_LfZapGERo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33406,7 +37045,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33418,34 +37057,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -33455,8 +37094,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -33466,10 +37105,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -33503,13 +37142,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -33518,7 +37157,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -33526,34 +37165,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 139 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33572,23 +37211,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -33597,8 +37236,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -33606,297 +37245,327 @@ tailLoopOptB: true - [2, 3, 0, 1] - - - [233, 128, 1024, 32] - - [90, 17.77] + - [60, 0.0] - - [512, 8192, 1, 3072] - [0, 0.0] - - [512, 8192, 1, 3960] - - [44, 0.0] + - [38, 0.0] - - [512, 8192, 1, 5640] - - [45, 0.0] + - [39, 0.0] - - [528, 8192, 1, 256] - - [106, 57.71] + - [109, 57.71] - - [528, 8192, 1, 512] - - [107, 74.32] + - [110, 74.32] - - [1024, 8192, 1, 1980] - [1, 0.0] - - [1024, 8192, 1, 3840] - [2, 0.0] - - [2440, 8192, 1, 128] - - [93, 59.84] + - [94, 59.84] - - [5640, 8192, 1, 128] - - [108, 66.59] + - [3, 0.0] - - [61, 128, 8192, 40] - - [109, 14.89] + - [40, 0.0] - - [128, 30, 8192, 4] - - [3, 0.0] + - [116, 2.44] - - [128, 33, 8192, 16] - - [110, 8.65] + - [117, 8.65] - - [128, 61, 8192, 40] - - [111, 18.88] + - [118, 18.88] - - [41, 17711, 1, 128] - - [116, 10.94] + - [77, 22531.0] - - [96, 17711, 1, 768] - - [4, 0.0] + - [126, 51.2] - - [256, 17711, 1, 887] - - [73, 166959.0] + - [119, 78.04] - - [384, 17711, 1, 2732] - - [5, 0.0] + - [4, 0.0] - - [960, 17711, 1, 128] - - [112, 55.24] + - [120, 55.24] - - [2480, 17711, 1, 128] - - [113, 63.59] + - [121, 63.59] - - [48, 124, 17711, 20] - - [6, 0.0] + - [127, 12.27] - - [128, 17711, 6, 128] - - [55, 0.0] + - [59, 0.0] - - [10, 655360, 1, 160] - - [7, 0.0] + - [5, 0.0] - - [28, 4096, 1, 256] - - [75, 11097.5] + - [151, 4.86] - - [32, 262144, 1, 57] - - [122, 17.74] + - [114, 17.74] - - [32, 262144, 1, 60] - - [121, 17.58] + - [131, 17.58] - - [32, 262144, 1, 82] - - [8, 0.0] + - [6, 0.0] - - [32, 262144, 1, 84] - - [9, 0.0] + - [138, 18.4] - - [48, 655360, 1, 192] - - [10, 0.0] + - [7, 0.0] - - [57, 4096, 1, 2048] - - [132, 27.48] + - [42, 0.0] - - [64, 4096, 1, 2048] - - [132, 29.94] + - [142, 29.94] - - [64, 102400, 1, 64] - - [46, 0.0] + - [133, 31.44] - - [64, 131072, 1, 128] - - [133, 40.14] + - [145, 40.14] - - [64, 527553, 1, 224] - - [138, 54.03] + - [149, 54.03] - - [64, 752863, 1, 224] - - [137, 55.19] + - [43, 0.0] - - [64, 806154, 1, 288] - - [125, 57.26] + - [44, 0.0] - - [72, 4096, 1, 256] - - [11, 0.0] + - [8, 0.0] - - [82, 4096, 1, 2048] - - [12, 0.0] + - [146, 31.41] - - [112, 655360, 1, 192] - - [13, 0.0] + - [130, 60.1] - - [116, 4096, 1, 256] - - [14, 0.0] + - [9, 0.0] - - [128, 4096, 1, 1600] - - [135, 40.1] + - [146, 40.1] - - [128, 131072, 1, 64] - - [15, 0.0] + - [10, 0.0] - - [160, 4096, 1, 512] - - [134, 28.98] + - [147, 28.98] - - [160, 4096, 1, 2048] - - [48, 0.0] + - [46, 0.0] - - [180, 4096, 1, 256] - - [16, 0.0] + - [11, 0.0] - - [256, 4096, 1, 28] - - [17, 0.0] + - [104, 4.69] - - [256, 4096, 1, 72] - - [18, 0.0] + - [12, 0.0] - - [256, 4096, 1, 116] - - [19, 0.0] + - [13, 0.0] - - [256, 4096, 1, 256] - - [76, 63493.6] + - [136, 27.9] - - [256, 4096, 1, 4132] - - [128, 73.06] + - [49, 0.0] - - [256, 4096, 1, 7680] - - [20, 0.0] + - [14, 0.0] - - [304, 655360, 1, 644] - - [127, 94.46] + - [135, 94.46] - - [320, 4096, 1, 116] - - [21, 0.0] + - [15, 0.0] - - [320, 4096, 1, 180] - - [22, 0.0] + - [16, 0.0] - - [512, 4096, 1, 96] - - [23, 0.0] + - [17, 0.0] - - [512, 4096, 1, 160] - - [24, 0.0] + - [154, 37.99] - - [512, 4096, 1, 512] - - [25, 0.0] + - [144, 58.22] - - [512, 4096, 1, 2246] - - [26, 0.0] + - [148, 82.64] - - [512, 4096, 1, 4132] - - [126, 93.58] + - [52, 0.0] - - [512, 4096, 1, 7680] - - [124, 105.67] + - [134, 105.67] - - [2048, 4096, 1, 128] - - [108, 53.21] + - [54, 0.0] - - [2048, 4096, 1, 2048] - - [27, 0.0] + - [18, 0.0] - - [2048, 4096, 1, 2624] - - [28, 0.0] + - [19, 0.0] - - [2246, 4096, 1, 512] - - [29, 0.0] + - [20, 0.0] - - [2560, 4096, 1, 4096] - - [136, 133.67] + - [55, 0.0] - - [25, 25, 8192, 32] - - [53, 0.0] + - [56, 0.0] - - [32, 25, 8192, 25] - - [54, 0.0] + - [57, 0.0] - - [64, 57, 4096, 32] - - [102, 15.56] + - [105, 15.56] - - [64, 82, 4096, 32] - - [131, 18.45] + - [141, 18.45] - - [160, 642, 4096, 48] - - [30, 0.0] + - [153, 29.42] - - [200, 32, 4096, 64] - - [129, 18.97] + - [139, 18.97] - - [642, 160, 4096, 48] - - [31, 0.0] + - [21, 0.0] - - [128, 2048, 1, 256] - - [32, 0.0] + - [22, 0.0] - - [128, 2048, 1, 1024] - - [47, 0.0] + - [45, 0.0] - - [256, 2048, 1, 32] - - [33, 0.0] + - [23, 0.0] - - [256, 2048, 1, 36] - - [34, 0.0] + - [24, 0.0] - - [256, 2048, 1, 40] - - [35, 0.0] + - [25, 0.0] - - [256, 2048, 1, 48] - - [36, 0.0] + - [26, 0.0] - - [256, 2048, 1, 64] - - [71, 13745.7] + - [76, 13745.7] - - [256, 2048, 1, 72] - - [37, 0.0] + - [27, 0.0] - - [256, 2048, 1, 80] - - [38, 0.0] + - [28, 0.0] - - [256, 2048, 1, 96] - - [39, 0.0] + - [29, 0.0] - - [256, 2048, 1, 128] - - [40, 0.0] + - [30, 0.0] - - [256, 2048, 1, 256] - - [41, 0.0] + - [31, 0.0] - - [512, 2048, 1, 14336] - [50, 0.0] - - [120, 8192, 1, 256] - - [94, 24.98] + - [95, 24.98] - - [128, 8192, 1, 512] - - [97, 41.14] + - [99, 41.14] - - [128, 8192, 1, 4352] - - [98, 73.84] + - [32, 0.0] - - [128, 8192, 1, 5120] - - [99, 76.51] + - [33, 0.0] - - [128, 8192, 1, 7296] - - [42, 0.0] + - [34, 0.0] - - [128, 98304, 1, 256] - - [100, 73.12] + - [103, 73.12] - - [256, 8192, 1, 120] - - [101, 27.41] + - [104, 27.41] - - [256, 8192, 1, 128] - - [102, 32.62] + - [105, 32.62] - - [256, 8192, 1, 512] - - [103, 58.9] + - [35, 0.0] - - [256, 8192, 1, 4352] - - [43, 0.0] + - [36, 0.0] - - [512, 8192, 1, 1024] - - [104, 90.42] + - [80, 200366.0] - - [512, 8192, 1, 2048] - - [105, 102.4] + - [37, 0.0] - - [56, 131072, 1, 233] - - [78, 31.76] + - [41, 0.0] - - [64, 131072, 1, 64] - - [123, 36.24] + - [132, 36.24] - - [128, 1024, 1, 64] - - [84, 1.72] + - [87, 1.72] - - [128, 1024, 1, 72] - - [87, 1.7] + - [89, 1.7] - - [128, 1024, 1, 96] - - [92, 2.37] + - [93, 2.37] - - [128, 1024, 1, 128] - - [79, 3.08] + - [82, 3.08] - - [128, 1024, 1, 144] - - [85, 2.98] + - [88, 2.98] - - [128, 1024, 1, 4096] - - [82, 31.71] + - [85, 31.71] - - [128, 17711, 1, 128] - - [72, 58658.9] + - [125, 27.26] - - [256, 1024, 1, 7968] - - [86, 58.1] + - [47, 0.0] - - [256, 4096, 1, 180] - - [101, 20.26] + - [48, 0.0] - - [320, 4096, 1, 28] - - [49, 0.0] + - [150, 5.38] - - [320, 4096, 1, 72] - - [131, 11.57] + - [141, 11.57] - - [512, 1024, 1, 2011] - - [83, 46.44] + - [86, 46.44] - - [512, 4096, 1, 80] - [51, 0.0] - - [1024, 2048, 1, 14336] - - [52, 0.0] + - [53, 0.0] - - [2011, 1024, 1, 512] - - [80, 58.83] + - [83, 58.83] - - [7456, 1024, 1, 128] - - [89, 58.56] + - [91, 58.56] - - [64, 4096, 96, 160] - - [130, 45.87] + - [140, 45.87] - - [124, 48, 17711, 20] - - [118, 10.87] + - [58, 0.0] - - [128, 233, 1024, 32] - - [93, 25.13] + - [94, 25.13] - - [64, 9419, 1, 5120] - - [65, 0.0] + - [70, 0.0] - - [64, 9420, 1, 5120] - - [56, 0.0] + - [61, 0.0] - - [64, 18389, 1, 5120] - - [57, 0.0] + - [62, 0.0] - - [64, 18392, 1, 5120] - - [58, 0.0] + - [63, 0.0] - - [64, 21090, 1, 5120] - - [68, 0.0] + - [73, 0.0] - - [64, 21092, 1, 5120] - - [59, 0.0] + - [64, 0.0] - - [5120, 1, 1, 256] - - [60, 0.0] + - [65, 0.0] - - [5120, 1, 1, 5120] - - [61, 0.0] + - [66, 0.0] - - [30720, 1, 1, 5120] - - [62, 0.0] + - [67, 0.0] - - [64, 4106, 1, 5120] - - [63, 0.0] + - [68, 0.0] - - [64, 4200, 1, 5120] - - [64, 0.0] + - [69, 0.0] - - [64, 9450, 1, 5120] - - [66, 0.0] + - [71, 0.0] - - [64, 9452, 1, 5120] - - [67, 0.0] + - [72, 0.0] - - [64, 21263, 1, 5120] - - [69, 0.0] + - [74, 0.0] - - [64, 21264, 1, 5120] - - [70, 0.0] + - [75, 0.0] - - [128, 17711, 1, 928] - - [117, 60.29] + - [123, 60.29] - - [17711, 246, 1, 384] - - [115, 63.33] + - [78, 123978.0] - - [120, 8192, 1, 512] - - [95, 36.23] + - [96, 36.23] - - [128, 8192, 1, 64] - - [96, 10.73] + - [97, 10.73] - - [512, 8192, 1, 256] - - [74, 128855.0] + - [79, 128855.0] - - [512, 4096, 1, 64] - - [139, 18.43] + - [152, 18.43] - - [4096, 1024, 1, 128] - - [77, 87046.7] + - [81, 87046.7] - - [128, 1024, 1, 512] - - [81, 9.27] + - [84, 9.27] - - [128, 1024, 1, 256] - - [88, 5.58] + - [90, 5.58] - - [7968, 1024, 1, 256] - - [91, 77.41] + - [92, 77.41] + - - [128, 8192, 1, 128] + - [98, 18.51] + - - [128, 8192, 1, 5640] + - [100, 70.94] + - - [128, 8192, 1, 6912] + - [101, 76.65] + - - [128, 8192, 1, 10880] + - [102, 78.44] + - - [256, 8192, 1, 256] + - [106, 45.29] + - - [256, 8192, 1, 528] + - [107, 52.71] + - - [256, 8192, 1, 4608] + - [108, 92.0] + - - [1980, 8192, 1, 512] + - [111, 98.91] + - - [3072, 8192, 1, 512] + - [112, 110.5] + - - [5120, 8192, 1, 128] + - [113, 69.7] + - - [32, 128, 8192, 4] + - [114, 2.59] + - - [36, 128, 8192, 16] + - [115, 8.31] - - [128, 17711, 1, 256] - - [114, 41.1] + - [122, 41.1] + - - [128, 17711, 1, 252] + - [124, 34.87] - - [41, 128, 17711, 6] - - [119, 3.24] + - [128, 3.24] - - [64, 819200, 1, 64] - - [120, 34.01] + - [129, 34.01] + - - [192, 160, 4096, 48] + - [137, 26.38] + - - [9216, 4096, 1, 512] + - [143, 114.52] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml index bbe51f2e23c..44c25a0a363 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml @@ -90,17 +90,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9yCMUvmmmKthOJiKqIB_mYxsvQAAqv6o_A39DiLPiA4s= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -133,45 +133,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 15360 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 15360 - LdsOffsetB_Blk: 48128 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 48128 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -181,15 +181,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -216,14 +216,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -233,7 +233,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -242,12 +242,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -258,17 +258,17 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -288,22 +288,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -312,8 +312,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -331,7 +331,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT8eZn8BuF1Nziw24iNKnDP44-Wc-OfwT54KqMWpiGeWHs= BufferLoad: true BufferStore: true CUCount: null @@ -341,7 +340,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -356,13 +355,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -374,39 +373,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 82432 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 82432 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 69632 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 143872 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 82432 - LdsOffsetMetadata_Blk: 143872 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -422,15 +421,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 8] - MIWaveTileA: 5 - MIWaveTileB: 8 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 512 - MacroTileA: 80 - MacroTileB: 512 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -457,15 +456,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -483,33 +482,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 8 - ThreadTileA: 20 - ThreadTileB: 8 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -528,23 +527,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -572,7 +571,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= BufferLoad: true BufferStore: true CUCount: null @@ -582,7 +581,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -603,7 +602,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -615,39 +614,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -663,9 +662,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 @@ -700,13 +699,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -724,32 +723,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -769,23 +768,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -794,8 +793,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -813,6 +812,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= BufferLoad: true BufferStore: true CUCount: null @@ -822,7 +822,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -855,39 +855,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34304 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 34304 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34304 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -903,15 +903,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [1, 1] MIWaveTile: [4, 2] MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -939,13 +939,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -964,12 +964,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -980,9 +980,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -1016,16 +1016,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -1053,17 +1053,16 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -1084,7 +1083,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1096,34 +1095,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 50176 + LdsNumBytes: 66560 LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1133,8 +1132,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -1145,14 +1144,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1180,14 +1179,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1196,7 +1195,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1205,17 +1204,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -1228,10 +1227,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1250,23 +1249,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -1282,7 +1281,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -1294,17 +1293,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -1325,7 +1324,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1337,45 +1336,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 32 - LVCB: 32 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -1385,15 +1384,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1421,15 +1420,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1437,7 +1436,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1446,33 +1445,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1491,23 +1490,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -1535,11 +1534,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1565,7 +1565,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1577,45 +1577,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 8 + LSPA: 4 + LSPB: 4 LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -1625,15 +1625,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1661,15 +1661,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1677,7 +1677,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1686,7 +1686,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1696,23 +1696,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1731,15 +1731,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -1775,7 +1775,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null @@ -1818,7 +1818,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -1927,7 +1927,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -1957,7 +1957,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2016,7 +2016,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= BufferLoad: true BufferStore: true CUCount: null @@ -2026,7 +2026,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -2059,45 +2059,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 + LSCA: 256 + LSCB: 256 LSPA: 4 LSPB: 4 - LVCA: 32 - LVCB: 32 + LVCA: 64 + LVCB: 64 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -2107,14 +2107,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -2143,15 +2143,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2168,12 +2168,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -2183,9 +2183,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -2198,7 +2198,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2220,16 +2220,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -2266,7 +2266,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -2299,39 +2299,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -2347,15 +2347,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [4, 4] MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2383,14 +2383,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2408,12 +2408,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -2423,10 +2423,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -2460,16 +2460,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -2485,7 +2485,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2497,17 +2497,16 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -2528,7 +2527,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2540,34 +2539,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2577,8 +2576,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -2588,15 +2587,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2624,15 +2623,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2640,7 +2639,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2649,33 +2648,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2694,23 +2693,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -2719,14 +2718,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -2738,17 +2737,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -2781,45 +2780,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 13312 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -2865,15 +2864,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2881,7 +2880,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2890,12 +2889,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -2920,7 +2919,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -2942,16 +2941,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -2979,6 +2978,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= BufferLoad: true BufferStore: true CUCount: null @@ -3021,8 +3021,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -3130,7 +3130,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -3160,7 +3160,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3183,7 +3183,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -3200,8 +3200,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -3219,6 +3219,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= BufferLoad: true BufferStore: true CUCount: null @@ -3228,7 +3229,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -3249,7 +3250,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3261,39 +3262,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 34816 + LdsNumBytes: 26112 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -3309,15 +3310,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3345,14 +3346,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3370,37 +3371,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -3415,23 +3416,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -3447,7 +3448,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -3459,17 +3460,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -3490,7 +3491,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3502,45 +3503,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13312 + LdsBytesNoAmax: 67072 LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -3550,15 +3551,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3586,15 +3587,15 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -3602,7 +3603,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3611,33 +3612,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3656,33 +3657,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBufferSingleKernel - _UseSgprForGRO: 1 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -3700,17 +3701,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -3731,7 +3732,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3743,34 +3744,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 34816 + LdsNumBytes: 26624 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3780,8 +3781,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -3792,14 +3793,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3827,14 +3828,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3843,7 +3844,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3852,17 +3853,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3875,10 +3876,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3897,8 +3898,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3910,10 +3911,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -3922,8 +3923,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -3941,12 +3942,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68SIh_bnF1917_7343uVm45oe_0GQqSHLfLK0W_wi_u8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3984,7 +3985,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -3997,32 +3998,32 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 26112 + LdsNumBytes: 34816 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -4032,15 +4033,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4068,14 +4069,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4084,7 +4085,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4093,7 +4094,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4109,9 +4110,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -4145,7 +4146,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -4182,12 +4183,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4213,7 +4214,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4225,8 +4226,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 LSCA: 128 LSCB: 128 LSPA: 8 @@ -4235,35 +4236,35 @@ LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67072 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 67072 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67072 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -4273,9 +4274,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 @@ -4310,7 +4311,7 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -4325,7 +4326,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4334,7 +4335,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4344,22 +4345,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -4379,14 +4380,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -4404,8 +4405,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -4423,7 +4424,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6R02QOCgIPWslloJiox9BjhuIrZ3CRli_uOaWUztN7xk= BufferLoad: true BufferStore: true CUCount: null @@ -4433,7 +4434,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -4454,7 +4455,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4466,39 +4467,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -4514,15 +4515,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4550,14 +4551,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4575,37 +4576,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -4620,23 +4621,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -4664,7 +4665,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AxfD8GQK1funv0jxGwFu2nRz6aluGc9YL9jcdz-7fEk= BufferLoad: true BufferStore: true CUCount: null @@ -4707,8 +4708,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 LSCA: 128 LSCB: 128 LSPA: 8 @@ -4720,21 +4721,21 @@ LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4756,13 +4757,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -4791,13 +4792,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -4816,7 +4817,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -4839,9 +4840,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -4886,8 +4887,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -5145,6 +5146,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= BufferLoad: true BufferStore: true CUCount: null @@ -5154,7 +5156,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -5187,39 +5189,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 72704 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 66560 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 72704 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 66560 - LdsOffsetB_Blk: 197632 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 99840 - LdsOffsetMetadata_Blk: 197632 + LdsOffsetMetadata: 72704 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -5235,15 +5237,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 384 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5271,14 +5273,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5296,12 +5298,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -5312,21 +5314,21 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5342,22 +5344,22 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -5366,8 +5368,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -5385,7 +5387,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= BufferLoad: true BufferStore: true CUCount: null @@ -5416,7 +5418,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5428,8 +5430,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -5438,24 +5440,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 72704 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 72704 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 148480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 72704 - LdsOffsetMetadata_Blk: 148480 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5476,15 +5478,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 6] - MIWaveTileA: 8 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 384 - MacroTileA: 128 - MacroTileB: 384 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5512,14 +5514,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5537,7 +5539,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5547,23 +5549,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 6 - ThreadTileA: 32 - ThreadTileB: 6 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5582,14 +5584,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -5607,8 +5609,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -5626,7 +5628,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= BufferLoad: true BufferStore: true CUCount: null @@ -5669,7 +5671,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -5778,7 +5780,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -5808,7 +5810,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -5867,17 +5869,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eRXgyMefjCuWXVIY2LbceXYoQhTimXSfxsYDWpNFOp0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -5898,7 +5900,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5910,34 +5912,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18432 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 18432 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5947,8 +5949,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -5958,15 +5960,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5994,14 +5996,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6010,7 +6012,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6019,32 +6021,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -6064,23 +6066,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -6089,8 +6091,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6150,8 +6152,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -6166,18 +6168,18 @@ LdsBytesNoAmax: 65536 LdsInitCVgprs: false LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 50688 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6198,15 +6200,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 1] MIWaveTile: [4, 4] MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6236,12 +6238,12 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6259,7 +6261,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6274,10 +6276,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -6289,7 +6291,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -6311,7 +6313,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [32, 4, 2] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -6329,8 +6331,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6348,7 +6350,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHDQbHdXWrhDPz3uAOriInqMw0_ypUmAB2yHcbrln9g= BufferLoad: true BufferStore: true CUCount: null @@ -6391,8 +6392,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -6500,7 +6501,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6553,7 +6554,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -6570,8 +6571,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6589,6 +6590,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= BufferLoad: true BufferStore: true CUCount: null @@ -6619,7 +6621,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6631,8 +6633,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -6641,24 +6643,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6680,13 +6682,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveTile: [5, 4] + MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 160 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 160 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -6714,14 +6716,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 10 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -6740,7 +6742,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6750,7 +6752,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -6763,9 +6765,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 20 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 20 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -6785,7 +6787,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -6793,7 +6795,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -6810,8 +6812,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -6829,7 +6831,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= BufferLoad: true BufferStore: true CUCount: null @@ -6872,8 +6873,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -6885,21 +6886,21 @@ LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 79872 + LdsBytesNoAmax: 90624 LdsInitCVgprs: false - LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6920,15 +6921,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 4] MIWaveTile: [5, 4] MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6958,12 +6959,12 @@ NumElementsPerBatchStore: 5 NumElementsPerThread: 80 NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumLoadsA: 5 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6981,7 +6982,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -6996,10 +6997,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -7033,8 +7034,8 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -7051,8 +7052,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -7074,12 +7075,12 @@ BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -7100,7 +7101,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7112,34 +7113,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 90624 + LdsBytesNoAmax: 87040 LdsInitCVgprs: false - LdsNumBytes: 90624 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 87040 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 69632 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 154112 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 90624 - LdsOffsetMetadata_Blk: 154112 + LdsOffsetMetadata: 87040 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7149,8 +7150,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -7161,14 +7162,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 256 - MacroTileA: 80 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7195,14 +7196,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 @@ -7212,7 +7213,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7221,17 +7222,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7244,14 +7245,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7266,7 +7267,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -7279,10 +7280,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -7298,7 +7299,7 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -7310,7 +7311,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1cxVKEoGpectXJ4hizehb-leeaygHA2aT8hzudE-aBUA= BufferLoad: true BufferStore: true CUCount: null @@ -7333,7 +7333,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: 1 - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7341,7 +7341,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7353,7 +7353,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -7363,24 +7363,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 128000 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7402,14 +7402,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7436,15 +7436,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7462,7 +7462,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -7472,7 +7472,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7485,10 +7485,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7507,15 +7507,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -7534,7 +7534,7 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -7551,7 +7551,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-XbEyQ5BisbWWMjTpMs98hyJ-dhnlHSIfClgvFcUEQk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1U7vOvdYFB8EejoI1DeZ2XXlfHVNEE5d0H12vbNNpATs= BufferLoad: true BufferStore: true CUCount: null @@ -7582,7 +7582,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7594,8 +7594,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -7604,24 +7604,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7643,14 +7643,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7678,14 +7678,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7703,7 +7703,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -7713,7 +7713,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7726,14 +7726,14 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7748,8 +7748,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -7773,8 +7773,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -7792,7 +7792,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= BufferLoad: true BufferStore: true CUCount: null @@ -7818,7 +7818,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true @@ -7835,34 +7835,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7883,15 +7883,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7919,14 +7919,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7944,7 +7944,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -7959,22 +7959,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -7990,13 +7990,13 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -8033,7 +8033,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT46N5m9gwCFO-Nzc9yENVU3SI1QrpCiOXEjeEPFSn86Hk= BufferLoad: true BufferStore: true CUCount: null @@ -8058,13 +8058,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8076,34 +8076,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26624 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26624 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8124,15 +8124,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8159,15 +8159,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8185,7 +8185,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8195,27 +8195,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 12 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 12 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8230,14 +8230,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -8255,8 +8255,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -8274,7 +8274,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vaiUWgr6lZ68qXC_TGMU65523uOYKq3Ec6eraxi_h38= BufferLoad: true BufferStore: true CUCount: null @@ -8317,8 +8317,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -8328,23 +8328,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44544 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 44544 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 30720 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 79360 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44544 - LdsOffsetMetadata_Blk: 79360 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8365,15 +8365,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 192 - MacroTileA: 96 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8400,15 +8400,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8426,7 +8426,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8441,18 +8441,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8472,13 +8472,13 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -8496,8 +8496,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -8515,7 +8515,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= BufferLoad: true BufferStore: true CUCount: null @@ -8541,12 +8541,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8558,34 +8558,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8606,15 +8606,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 160 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 160 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8642,14 +8642,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 4 - NumLoadsB: 5 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8667,7 +8667,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -8677,23 +8677,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8712,14 +8712,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -8756,17 +8756,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -8787,7 +8787,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8799,34 +8799,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 57856 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8836,8 +8836,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -8847,15 +8847,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 80 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 80 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8883,14 +8883,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8899,7 +8899,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8908,37 +8908,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -8953,23 +8953,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -8978,8 +8978,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -8997,6 +8997,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1rXTuKdoyxclHidecoClA0AfcqpENiqcNipr2eRQNhwg= BufferLoad: true BufferStore: true CUCount: null @@ -9027,7 +9028,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9039,7 +9040,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 64 @@ -9049,24 +9050,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 79872 LdsInitCVgprs: false LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -9088,14 +9089,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9122,15 +9123,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 5 + NumElementsPerBatchStore: 8 NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9148,7 +9149,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -9158,7 +9159,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -9171,10 +9172,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9193,15 +9194,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -9237,7 +9238,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT31zhAEgAY7p8a5We0BzsiowfNKry6VBUHEQOZboSmQsk= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= BufferLoad: true BufferStore: true CUCount: null @@ -9247,7 +9248,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9280,39 +9281,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 44544 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -9328,15 +9329,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9363,14 +9364,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 @@ -9389,12 +9390,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9405,16 +9406,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 24 ThreadTile1: 3 - ThreadTileA: 8 + ThreadTileA: 24 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9441,16 +9442,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -9459,8 +9460,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -9478,6 +9479,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= BufferLoad: true BufferStore: true CUCount: null @@ -9487,7 +9489,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9520,39 +9522,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 81920 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 81920 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 56832 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -9568,15 +9570,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [2, 2] MIWaveTile: [4, 5] MIWaveTileA: 4 MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 80 + MacroTile1: 160 MacroTileA: 128 - MacroTileB: 80 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9604,13 +9606,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 10 - NumLoadsA: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 @@ -9629,12 +9631,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9645,9 +9647,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -9681,16 +9683,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -9699,8 +9701,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -9718,16 +9720,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9743,7 +9746,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true @@ -9760,34 +9763,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -9797,8 +9800,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -9809,14 +9812,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] + MIWaveTile: [2, 5] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 48 + MacroTile1: 80 MacroTileA: 128 - MacroTileB: 48 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9844,14 +9847,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9860,7 +9863,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9869,12 +9872,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9893,9 +9896,9 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 5 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9922,15 +9925,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -9939,8 +9942,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -9958,7 +9961,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null @@ -9968,7 +9970,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9983,13 +9985,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10001,39 +10003,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 26112 + LdsBytesNoAmax: 81920 LdsInitCVgprs: false - LdsNumBytes: 26112 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 40448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 26112 - LdsOffsetMetadata_Blk: 40448 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -10049,15 +10051,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 128 - MacroTileA: 48 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10084,15 +10086,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 6 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10110,33 +10112,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10155,23 +10157,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -10180,8 +10182,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -10199,7 +10201,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KkEUdHGIe6dKJ0zzMlcYFBbwtI1FwPuq4-b2hkE_RVc= BufferLoad: true BufferStore: true CUCount: null @@ -10242,8 +10244,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -10255,21 +10257,21 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10290,15 +10292,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 320 - MacroTileA: 64 - MacroTileB: 320 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10326,14 +10328,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10351,7 +10353,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -10366,22 +10368,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10403,7 +10405,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -10421,8 +10423,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -10440,7 +10442,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65_iMlfIkk8B93jR0j9fItJcWD8qLSyBShDieS7L1wt0= BufferLoad: true BufferStore: true CUCount: null @@ -10450,7 +10452,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10483,39 +10485,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 31232 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -10531,15 +10533,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10567,14 +10569,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10592,12 +10594,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -10607,18 +10609,18 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10638,22 +10640,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -10662,8 +10664,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -10681,17 +10683,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9gGJDLTTlViLZGJavc0sPMnxgvCGjSJIqu4gt9wnKCkU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10712,7 +10714,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10724,34 +10726,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 55296 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10761,8 +10763,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -10772,15 +10774,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10807,15 +10809,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10824,7 +10826,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10833,37 +10835,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -10878,23 +10880,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -10922,7 +10924,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45ulS7WcdI7UNW-ipBhI5_9g0NWEweK1v4iw5AdMdwyw= BufferLoad: true BufferStore: true CUCount: null @@ -10947,13 +10949,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10965,34 +10967,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11014,14 +11016,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11048,14 +11050,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -11074,7 +11076,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11084,7 +11086,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11097,10 +11099,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11119,8 +11121,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11163,16 +11165,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -11187,13 +11190,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11205,34 +11208,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11242,8 +11245,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -11253,14 +11256,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 48 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 48 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11288,15 +11291,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11305,7 +11308,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11314,37 +11317,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -11359,23 +11362,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -11403,7 +11406,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= BufferLoad: true BufferStore: true CUCount: null @@ -11446,8 +11449,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -11457,13 +11460,13 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 43520 + LdsNumBytes: 59904 LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -11472,7 +11475,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 + LdsOffsetMetadata: 59904 LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 @@ -11495,14 +11498,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] + MIWaveTile: [4, 5] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 320 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11530,14 +11533,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11555,7 +11558,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -11579,9 +11582,9 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 5 ThreadTileA: 16 - ThreadTileB: 4 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11601,7 +11604,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11625,8 +11628,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -11644,17 +11647,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zKzQos5jBY-oZsuABvfgC-YVr0XQkeZ9FoaVzwSEWkA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -11669,13 +11672,13 @@ ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11687,34 +11690,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 67584 + LdsBytesNoAmax: 48640 LdsInitCVgprs: false - LdsNumBytes: 67584 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 48640 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 67584 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 48640 + LdsOffsetMetadata_Blk: 83456 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11724,8 +11727,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -11735,15 +11738,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 112 + MacroTile1: 192 + MacroTileA: 112 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11770,15 +11773,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11787,7 +11790,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11796,33 +11799,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11841,23 +11844,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -11885,6 +11888,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= BufferLoad: true BufferStore: true CUCount: null @@ -11927,7 +11931,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -11938,23 +11942,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 94720 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 94720 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 71680 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 154112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 94720 - LdsOffsetMetadata_Blk: 154112 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11976,14 +11980,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [10, 7] - MIWaveTileA: 10 - MIWaveTileB: 7 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 448 - MacroTileA: 160 - MacroTileB: 448 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12010,15 +12014,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 280 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 5 - NumLoadsB: 14 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12036,7 +12040,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -12059,10 +12063,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 7 - ThreadTileA: 40 - ThreadTileB: 7 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12082,14 +12086,14 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -12125,6 +12129,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= BufferLoad: true BufferStore: true CUCount: null @@ -12134,7 +12139,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12167,39 +12172,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 66560 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 66560 - LdsOffsetB_Blk: 197632 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 99840 - LdsOffsetMetadata_Blk: 197632 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -12215,15 +12220,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [8, 4] - MIWaveTileA: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12251,13 +12256,13 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -12276,12 +12281,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -12292,21 +12297,21 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -12328,16 +12333,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -12365,7 +12370,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= BufferLoad: true BufferStore: true CUCount: null @@ -12396,7 +12401,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12408,8 +12413,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -12418,24 +12423,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12456,15 +12461,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12492,14 +12497,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12517,7 +12522,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -12527,23 +12532,23 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12562,14 +12567,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -12587,8 +12592,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12610,12 +12615,12 @@ BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12648,34 +12653,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12685,8 +12690,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -12696,15 +12701,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 2] MIWaveTile: [4, 4] MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12734,11 +12739,11 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -12748,7 +12753,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12757,12 +12762,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -12772,10 +12777,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -12809,16 +12814,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -12827,8 +12832,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12846,7 +12851,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= BufferLoad: true BufferStore: true CUCount: null @@ -12889,7 +12893,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -12902,21 +12906,21 @@ LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 94720 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12937,15 +12941,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12972,15 +12976,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12998,7 +13002,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -13013,22 +13017,22 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13050,8 +13054,8 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -13087,17 +13091,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hKaBSm-LDSAQofLbqhwHQZEeIsXzaeyVIdwWTVQFEe8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13130,45 +13134,45 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -13178,15 +13182,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13214,8 +13218,8 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -13230,7 +13234,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13239,12 +13243,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -13254,10 +13258,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 @@ -13269,7 +13273,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13291,16 +13295,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -13309,8 +13313,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13328,6 +13332,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6sFfDRoimNH8imnTTvcLS2cE_cU4dhe0vq9JydCcQHaY= BufferLoad: true BufferStore: true CUCount: null @@ -13358,7 +13363,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13370,8 +13375,8 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -13380,24 +13385,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100864 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 100864 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 92160 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 100864 - LdsOffsetMetadata_Blk: 139776 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13418,15 +13423,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 576 + MacroTile1: 160 MacroTileA: 64 - MacroTileB: 576 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13454,14 +13459,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 2 - NumLoadsB: 18 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 18 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13479,7 +13484,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -13489,27 +13494,27 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 1 + UnrollLoopSwapGlobalReadOrder: 0 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -13524,15 +13529,15 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -13549,8 +13554,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13577,7 +13582,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13598,7 +13603,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13610,39 +13615,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -13658,15 +13663,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 48 + MacroTile1: 128 MacroTileA: 32 - MacroTileB: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13694,14 +13699,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13719,33 +13724,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13764,23 +13769,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -13789,8 +13794,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13808,7 +13813,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= BufferLoad: true BufferStore: true CUCount: null @@ -13818,7 +13823,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13839,7 +13844,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13851,39 +13856,39 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 164352 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -13899,15 +13904,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13936,13 +13941,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13960,37 +13965,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14005,23 +14010,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -14030,14 +14035,14 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -14071,7 +14076,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: 1 - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14079,7 +14084,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14091,7 +14096,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -14101,24 +14106,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14140,13 +14145,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [10, 4] - MIWaveTileA: 10 + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 64 MacroTile1: 256 - MacroTileA: 160 + MacroTileA: 64 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -14174,14 +14179,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -14200,7 +14205,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14210,7 +14215,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -14223,9 +14228,9 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 40 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -14245,7 +14250,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -14272,7 +14277,7 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -14289,6 +14294,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= BufferLoad: true BufferStore: true CUCount: null @@ -14319,6 +14325,2654 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128000 + LdsInitCVgprs: false + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6Q86Al2n33vUP8nQtrYZlZYGsrzmmIC_0MPgq_9eYXoc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100864 + LdsInitCVgprs: false + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 92160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 576 + MacroTileA: 64 + MacroTileB: 576 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 2 + NumLoadsB: 18 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 18 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eM9UlY0usrKvssU3Ar5zZiiJpm6PCupFKVpS-ZFqsX0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ByCgfye7rWIPQ1YJohmIQ0lNCn8NrEKn7vXmxjyaZFM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -14331,34 +16985,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14368,8 +17022,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -14380,14 +17034,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] + MIWaveTile: [4, 1] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 320 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 320 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14415,14 +17069,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14431,7 +17085,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14439,13 +17093,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -14464,9 +17118,9 @@ SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 5 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14493,15 +17147,15 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -14529,7 +17183,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZglzLcUOj7D9-4HA3xQ6qm78zoBYVUYUrp0ST-4N2Tc= BufferLoad: true BufferStore: true CUCount: null @@ -14560,7 +17214,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14572,7 +17226,7 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 @@ -14582,24 +17236,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 78336 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 78336 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 78336 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14621,14 +17275,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 160 + MacroTile1: 384 + MacroTileA: 160 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14655,15 +17309,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14680,8 +17334,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -14691,7 +17345,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -14704,10 +17358,10 @@ SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14726,8 +17380,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -14770,17 +17424,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HKFsR1xpqIqB8iGFlETrmdcYqlXRxk0kl_LfZapGERo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -14801,7 +17455,7 @@ GlobalSplitUAlgorithm: MultipleBufferSingleKernel GlobalSplitUCoalesced: true GlobalSplitUWorkGroupMappingRoundRobin: true - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14813,34 +17467,34 @@ SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14850,8 +17504,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] @@ -14861,10 +17515,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -14898,13 +17552,13 @@ NonTemporalWS: 0 NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14913,7 +17567,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14921,34 +17575,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14967,23 +17621,23 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, -1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 @@ -14992,150 +17646,172 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - [2, 3, 0, 1] -- - - [233, 128, 1024, 32] - - [0, 17.77] - - - [56, 131072, 1, 233] - - [1, 31.76] - - - [128, 1024, 1, 128] - - [2, 3.08] +- - - [128, 1024, 1, 128] + - [0, 3.08] - - [2011, 1024, 1, 512] - - [3, 58.83] + - [1, 58.83] - - [128, 1024, 1, 512] - - [4, 9.27] + - [2, 9.27] - - [128, 1024, 1, 4096] - - [5, 31.71] + - [3, 31.71] - - [512, 1024, 1, 2011] - - [6, 46.44] + - [4, 46.44] - - [128, 1024, 1, 64] - - [7, 1.72] + - [5, 1.72] - - [128, 1024, 1, 144] - - [8, 2.98] - - - [256, 1024, 1, 7968] - - [9, 58.1] + - [6, 2.98] - - [128, 1024, 1, 72] - - [10, 1.7] + - [7, 1.7] - - [128, 1024, 1, 256] - - [11, 5.58] + - [8, 5.58] - - [7456, 1024, 1, 128] - - [12, 58.56] + - [9, 58.56] - - [7968, 1024, 1, 256] - - [13, 77.41] + - [10, 77.41] - - [128, 1024, 1, 96] - - [14, 2.37] + - [11, 2.37] - - [128, 233, 1024, 32] - - [15, 25.13] + - [12, 25.13] - - [120, 8192, 1, 256] - - [16, 24.98] + - [13, 24.98] - - [120, 8192, 1, 512] - - [17, 36.23] + - [14, 36.23] - - [128, 8192, 1, 64] - - [18, 10.73] + - [15, 10.73] + - - [128, 8192, 1, 128] + - [16, 18.51] - - [128, 8192, 1, 512] - - [19, 41.14] - - - [128, 8192, 1, 4352] - - [20, 73.84] - - - [128, 8192, 1, 5120] - - [21, 76.51] + - [17, 41.14] + - - [128, 8192, 1, 5640] + - [18, 70.94] + - - [128, 8192, 1, 6912] + - [19, 76.65] + - - [128, 8192, 1, 10880] + - [20, 78.44] - - [128, 98304, 1, 256] - - [22, 73.12] + - [21, 73.12] - - [256, 8192, 1, 120] - - [23, 27.41] + - [22, 27.41] - - [256, 8192, 1, 128] - - [24, 32.62] - - - [256, 8192, 1, 512] - - [25, 58.9] - - - [512, 8192, 1, 1024] - - [26, 90.42] - - - [512, 8192, 1, 2048] - - [27, 102.4] + - [23, 32.62] + - - [256, 8192, 1, 256] + - [24, 45.29] + - - [256, 8192, 1, 528] + - [25, 52.71] + - - [256, 8192, 1, 4608] + - [26, 92.0] - - [528, 8192, 1, 256] - - [28, 57.71] + - [27, 57.71] - - [528, 8192, 1, 512] - - [29, 74.32] + - [28, 74.32] + - - [1980, 8192, 1, 512] + - [29, 98.91] - - [2440, 8192, 1, 128] - - [15, 59.84] - - - [5640, 8192, 1, 128] - - [30, 66.59] - - - [61, 128, 8192, 40] - - [31, 14.89] + - [12, 59.84] + - - [3072, 8192, 1, 512] + - [30, 110.5] + - - [5120, 8192, 1, 128] + - [31, 69.7] + - - [32, 128, 8192, 4] + - [32, 2.59] + - - [36, 128, 8192, 16] + - [33, 8.31] + - - [128, 30, 8192, 4] + - [34, 2.44] - - [128, 33, 8192, 16] - - [32, 8.65] + - [35, 8.65] - - [128, 61, 8192, 40] - - [33, 18.88] + - [36, 18.88] + - - [256, 17711, 1, 887] + - [37, 78.04] - - [960, 17711, 1, 128] - - [34, 55.24] + - [38, 55.24] - - [2480, 17711, 1, 128] - - [35, 63.59] + - [39, 63.59] - - [128, 17711, 1, 256] - - [36, 41.1] - - - [17711, 246, 1, 384] - - [37, 63.33] - - - [41, 17711, 1, 128] - - [38, 10.94] + - [40, 41.1] - - [128, 17711, 1, 928] - - [39, 60.29] - - - [124, 48, 17711, 20] - - [40, 10.87] + - [41, 60.29] + - - [128, 17711, 1, 252] + - [42, 34.87] + - - [128, 17711, 1, 128] + - [43, 27.26] + - - [96, 17711, 1, 768] + - [44, 51.2] + - - [48, 124, 17711, 20] + - [45, 12.27] - - [41, 128, 17711, 6] - - [41, 3.24] + - [46, 3.24] - - [64, 819200, 1, 64] - - [42, 34.01] + - [47, 34.01] + - - [112, 655360, 1, 192] + - [48, 60.1] - - [32, 262144, 1, 60] - - [43, 17.58] + - [49, 17.58] - - [32, 262144, 1, 57] - - [44, 17.74] + - [32, 17.74] - - [64, 131072, 1, 64] - - [45, 36.24] + - [50, 36.24] + - - [64, 102400, 1, 64] + - [51, 31.44] - - [512, 4096, 1, 7680] - - [46, 105.67] - - - [64, 806154, 1, 288] - - [47, 57.26] - - - [512, 4096, 1, 4132] - - [48, 93.58] + - [52, 105.67] - - [304, 655360, 1, 644] - - [49, 94.46] - - - [256, 4096, 1, 4132] - - [50, 73.06] + - [53, 94.46] + - - [256, 4096, 1, 256] + - [54, 27.9] + - - [192, 160, 4096, 48] + - [55, 26.38] - - [64, 57, 4096, 32] - - [24, 15.56] + - [23, 15.56] + - - [32, 262144, 1, 84] + - [56, 18.4] - - [200, 32, 4096, 64] - - [51, 18.97] + - [57, 18.97] - - [64, 4096, 96, 160] - - [52, 45.87] + - [58, 45.87] - - [64, 82, 4096, 32] - - [53, 18.45] + - [59, 18.45] - - [64, 4096, 1, 2048] - - [54, 29.94] - - - [2048, 4096, 1, 128] - - [30, 53.21] + - [60, 29.94] + - - [9216, 4096, 1, 512] + - [61, 114.52] + - - [512, 4096, 1, 512] + - [62, 58.22] - - [64, 131072, 1, 128] - - [55, 40.14] + - [63, 40.14] + - - [82, 4096, 1, 2048] + - [64, 31.41] - - [160, 4096, 1, 512] - - [56, 28.98] + - [65, 28.98] - - [128, 4096, 1, 1600] - - [57, 40.1] - - - [2560, 4096, 1, 4096] - - [58, 133.67] + - [64, 40.1] - - [320, 4096, 1, 72] - - [53, 11.57] - - - [64, 752863, 1, 224] - - [59, 55.19] + - [59, 11.57] + - - [512, 4096, 1, 2246] + - [66, 82.64] - - [64, 527553, 1, 224] - - [60, 54.03] - - - [57, 4096, 1, 2048] - - [54, 27.48] + - [67, 54.03] + - - [320, 4096, 1, 28] + - [68, 5.38] + - - [28, 4096, 1, 256] + - [69, 4.86] - - [512, 4096, 1, 64] - - [61, 18.43] - - - [256, 4096, 1, 180] - - [23, 20.26] + - [70, 18.43] + - - [160, 642, 4096, 48] + - [71, 29.42] + - - [256, 4096, 1, 28] + - [22, 4.69] + - - [512, 4096, 1, 160] + - [72, 37.99] - null - null - DeviceEfficiency